mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Add MSA optimized YUY2ToI422, YUY2ToI420, UYVYToI422, UYVYToI420 functions
R=fbarchard@google.com BUG=libyuv:634 Performance gains as below, YUY2ToI422, YUY2ToI420 :- YUY2ToYRow_MSA : ~10x YUY2ToUVRow_MSA : ~11x YUY2ToUV422Row_MSA : ~9x YUY2ToYRow_Any_MSA : ~6x YUY2ToUVRow_Any_MSA : ~5x YUY2ToUV422Row_Any_MSA : ~4x UYVYToI422, UYVYToI420 :- UYVYToYRow_MSA : ~10x UYVYToUVRow_MSA : ~11x UYVYToUV422Row_MSA : ~9x UYVYToYRow_Any_MSA : ~6x UYVYToUVRow_Any_MSA : ~5x UYVYToUV422Row_Any_MSA : ~4x Review URL: https://codereview.chromium.org/2397693002 .
This commit is contained in:
parent
3b88a19ab1
commit
a2891ec77c
@ -166,12 +166,12 @@ ia32
|
||||
mipsel
|
||||
|
||||
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
ninja -j7 -v -C out/Debug libyuv_unittest
|
||||
ninja -j7 -v -C out/Release libyuv_unittest
|
||||
|
||||
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
|
||||
ninja -j7 -v -C out/Debug libyuv_unittest
|
||||
ninja -j7 -v -C out/Release libyuv_unittest
|
||||
|
||||
|
||||
@ -372,6 +372,12 @@ extern "C" {
|
||||
#define HAS_ARGBMIRRORROW_MSA
|
||||
#define HAS_I422TOYUY2ROW_MSA
|
||||
#define HAS_I422TOUYVYROW_MSA
|
||||
#define HAS_YUY2TOYROW_MSA
|
||||
#define HAS_YUY2TOUVROW_MSA
|
||||
#define HAS_YUY2TOUV422ROW_MSA
|
||||
#define HAS_UYVYTOYROW_MSA
|
||||
#define HAS_UYVYTOUVROW_MSA
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
|
||||
@ -1669,6 +1675,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
|
||||
void YUY2ToUVRow_MSA(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
|
||||
void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
@ -1689,6 +1700,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
|
||||
void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
|
||||
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
@ -1709,6 +1725,11 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToUV422Row_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
|
||||
void UYVYToUVRow_MSA(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToUV422Row_MSA(const uint8* src_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
|
||||
void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
|
||||
@ -1730,6 +1751,11 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
|
||||
void UYVYToUVRow_Any_MSA(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
void I422ToYUY2Row_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
|
||||
@ -392,6 +392,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_YUY2TOYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
YUY2ToYRow = YUY2ToYRow_Any_MSA;
|
||||
YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
YUY2ToYRow = YUY2ToYRow_MSA;
|
||||
YUY2ToUVRow = YUY2ToUVRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
|
||||
@ -457,6 +467,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_UYVYTOYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
UYVYToYRow = UYVYToYRow_Any_MSA;
|
||||
UYVYToUVRow = UYVYToUVRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
UYVYToYRow = UYVYToYRow_MSA;
|
||||
UYVYToUVRow = UYVYToUVRow_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
|
||||
|
||||
@ -482,6 +482,16 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_YUY2TOYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
YUY2ToYRow = YUY2ToYRow_Any_MSA;
|
||||
YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
YUY2ToYRow = YUY2ToYRow_MSA;
|
||||
YUY2ToUV422Row = YUY2ToUV422Row_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
|
||||
@ -556,6 +566,16 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_UYVYTOYROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
UYVYToYRow = UYVYToYRow_Any_MSA;
|
||||
UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
UYVYToYRow = UYVYToYRow_MSA;
|
||||
UYVYToUV422Row = UYVYToUV422Row_MSA;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
|
||||
|
||||
@ -442,6 +442,12 @@ ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
|
||||
#ifdef HAS_UYVYTOYROW_NEON
|
||||
ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOYROW_MSA
|
||||
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_UYVYTOYROW_MSA
|
||||
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 0, 2, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOARGBROW_NEON
|
||||
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
|
||||
#endif
|
||||
@ -763,6 +769,10 @@ ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
|
||||
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
|
||||
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUV422ROW_MSA
|
||||
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
|
||||
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
|
||||
#endif
|
||||
#undef ANY12
|
||||
|
||||
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
|
||||
@ -848,6 +858,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
|
||||
#ifdef HAS_UYVYTOUVROW_NEON
|
||||
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOUVROW_MSA
|
||||
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
|
||||
#endif
|
||||
#ifdef HAS_UYVYTOUVROW_MSA
|
||||
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
|
||||
#endif
|
||||
#undef ANY12S
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -101,6 +101,126 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
|
||||
}
|
||||
}
|
||||
|
||||
// Copies the luma (Y) samples out of one row of packed YUY2 into dst_y.
//
// src_yuy2: packed source row; 64 bytes are consumed per loop pass.
// dst_y:    destination luma row; 32 bytes are produced per loop pass.
// width:    pixel count. The loop advances in steps of 32, so a width that
//           is not a multiple of 32 still processes a full final group.
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
  int done;
  v16u8 in0, in1, in2, in3;
  v16u8 luma_lo, luma_hi;

  for (done = 0; done < width; done += 32) {
    // Presumably four 16-byte vector loads at a 16-byte pitch (64 bytes
    // total) -- the macro is defined outside this file.
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    src_yuy2 += 64;

    // YUY2 is laid out Y0 U0 Y1 V0, so luma sits in the even bytes.
    luma_lo = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);
    luma_hi = (v16u8) __msa_pckev_b((v16i8) in3, (v16i8) in2);

    ST_UB2(luma_lo, luma_hi, dst_y, 16);
    dst_y += 32;
  }
}
|
||||
|
||||
// Produces one row of U and one row of V from two vertically adjacent rows
// of packed YUY2, averaging the chroma of the two rows.
//
// src_yuy2:        first source row.
// src_stride_yuy2: byte offset from the first source row to the second.
// dst_u, dst_v:    destination chroma rows; 16 bytes each per loop pass.
// width:           pixel count. The loop advances in steps of 32, so a
//                  width that is not a multiple of 32 still processes a
//                  full final group.
void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row_below = src_yuy2 + src_stride_yuy2;
  int done;
  v16u8 top0, top1, top2, top3;
  v16u8 bot0, bot1, bot2, bot3;
  v16u8 top_uv_lo, top_uv_hi, bot_uv_lo, bot_uv_hi;
  v16u8 avg_lo, avg_hi;
  v16u8 u_out, v_out;

  for (done = 0; done < width; done += 32) {
    LD_UB4(src_yuy2, 16, top0, top1, top2, top3);
    LD_UB4(row_below, 16, bot0, bot1, bot2, bot3);

    // YUY2 is laid out Y0 U0 Y1 V0: the odd bytes are the interleaved
    // U/V samples, so packing the odd bytes discards luma.
    top_uv_lo = (v16u8) __msa_pckod_b((v16i8) top1, (v16i8) top0);
    top_uv_hi = (v16u8) __msa_pckod_b((v16i8) top3, (v16i8) top2);
    bot_uv_lo = (v16u8) __msa_pckod_b((v16i8) bot1, (v16i8) bot0);
    bot_uv_hi = (v16u8) __msa_pckod_b((v16i8) bot3, (v16i8) bot2);

    // Vertical chroma downsample: unsigned byte average of the two rows.
    avg_lo = __msa_aver_u_b(top_uv_lo, bot_uv_lo);
    avg_hi = __msa_aver_u_b(top_uv_hi, bot_uv_hi);

    // De-interleave U V U V ...: even bytes are U, odd bytes are V.
    u_out = (v16u8) __msa_pckev_b((v16i8) avg_hi, (v16i8) avg_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) avg_hi, (v16i8) avg_lo);

    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);

    src_yuy2 += 64;
    row_below += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
|
||||
|
||||
// Splits the chroma of a single packed YUY2 row into separate U and V rows
// (no vertical averaging; one source row in, one U row and one V row out).
//
// src_yuy2:     packed source row; 64 bytes are consumed per loop pass.
// dst_u, dst_v: destination chroma rows; 16 bytes each per loop pass.
// width:        pixel count. The loop advances in steps of 32, so a width
//               that is not a multiple of 32 still processes a full final
//               group.
void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                        int width) {
  int done;
  v16u8 in0, in1, in2, in3;
  v16u8 uv_lo, uv_hi;
  v16u8 u_out, v_out;

  for (done = 0; done < width; done += 32) {
    LD_UB4(src_yuy2, 16, in0, in1, in2, in3);
    src_yuy2 += 64;

    // Odd bytes of YUY2 (Y0 U0 Y1 V0) hold the interleaved U/V samples.
    uv_lo = (v16u8) __msa_pckod_b((v16i8) in1, (v16i8) in0);
    uv_hi = (v16u8) __msa_pckod_b((v16i8) in3, (v16i8) in2);

    // Within the U V U V ... stream, even bytes are U and odd bytes are V.
    u_out = (v16u8) __msa_pckev_b((v16i8) uv_hi, (v16i8) uv_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) uv_hi, (v16i8) uv_lo);

    ST_UB(u_out, dst_u);
    dst_u += 16;
    ST_UB(v_out, dst_v);
    dst_v += 16;
  }
}
|
||||
|
||||
// Copies the luma (Y) samples out of one row of packed UYVY into dst_y.
//
// src_uyvy: packed source row; 64 bytes are consumed per loop pass.
// dst_y:    destination luma row; 32 bytes are produced per loop pass.
// width:    pixel count. The loop advances in steps of 32, so a width that
//           is not a multiple of 32 still processes a full final group.
void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
  int done;
  v16u8 in0, in1, in2, in3;
  v16u8 luma_lo, luma_hi;

  for (done = 0; done < width; done += 32) {
    // Presumably four 16-byte vector loads at a 16-byte pitch (64 bytes
    // total) -- the macro is defined outside this file.
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    src_uyvy += 64;

    // UYVY is laid out U0 Y0 V0 Y1, so luma sits in the odd bytes.
    luma_lo = (v16u8) __msa_pckod_b((v16i8) in1, (v16i8) in0);
    luma_hi = (v16u8) __msa_pckod_b((v16i8) in3, (v16i8) in2);

    ST_UB2(luma_lo, luma_hi, dst_y, 16);
    dst_y += 32;
  }
}
|
||||
|
||||
// Produces one row of U and one row of V from two vertically adjacent rows
// of packed UYVY, averaging the chroma of the two rows.
//
// src_uyvy:        first source row.
// src_stride_uyvy: byte offset from the first source row to the second.
// dst_u, dst_v:    destination chroma rows; 16 bytes each per loop pass.
// width:           pixel count. The loop advances in steps of 32, so a
//                  width that is not a multiple of 32 still processes a
//                  full final group.
void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
                     uint8* dst_u, uint8* dst_v, int width) {
  const uint8* row_below = src_uyvy + src_stride_uyvy;
  int done;
  v16u8 top0, top1, top2, top3;
  v16u8 bot0, bot1, bot2, bot3;
  v16u8 top_uv_lo, top_uv_hi, bot_uv_lo, bot_uv_hi;
  v16u8 avg_lo, avg_hi;
  v16u8 u_out, v_out;

  for (done = 0; done < width; done += 32) {
    LD_UB4(src_uyvy, 16, top0, top1, top2, top3);
    LD_UB4(row_below, 16, bot0, bot1, bot2, bot3);

    // UYVY is laid out U0 Y0 V0 Y1: the even bytes are the interleaved
    // U/V samples, so packing the even bytes discards luma.
    top_uv_lo = (v16u8) __msa_pckev_b((v16i8) top1, (v16i8) top0);
    top_uv_hi = (v16u8) __msa_pckev_b((v16i8) top3, (v16i8) top2);
    bot_uv_lo = (v16u8) __msa_pckev_b((v16i8) bot1, (v16i8) bot0);
    bot_uv_hi = (v16u8) __msa_pckev_b((v16i8) bot3, (v16i8) bot2);

    // Vertical chroma downsample: unsigned byte average of the two rows.
    avg_lo = __msa_aver_u_b(top_uv_lo, bot_uv_lo);
    avg_hi = __msa_aver_u_b(top_uv_hi, bot_uv_hi);

    // De-interleave U V U V ...: even bytes are U, odd bytes are V.
    u_out = (v16u8) __msa_pckev_b((v16i8) avg_hi, (v16i8) avg_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) avg_hi, (v16i8) avg_lo);

    ST_UB(u_out, dst_u);
    ST_UB(v_out, dst_v);

    src_uyvy += 64;
    row_below += 64;
    dst_u += 16;
    dst_v += 16;
  }
}
|
||||
|
||||
// Splits the chroma of a single packed UYVY row into separate U and V rows
// (no vertical averaging; one source row in, one U row and one V row out).
//
// src_uyvy:     packed source row; 64 bytes are consumed per loop pass.
// dst_u, dst_v: destination chroma rows; 16 bytes each per loop pass.
// width:        pixel count. The loop advances in steps of 32, so a width
//               that is not a multiple of 32 still processes a full final
//               group.
void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                        int width) {
  int done;
  v16u8 in0, in1, in2, in3;
  v16u8 uv_lo, uv_hi;
  v16u8 u_out, v_out;

  for (done = 0; done < width; done += 32) {
    LD_UB4(src_uyvy, 16, in0, in1, in2, in3);
    src_uyvy += 64;

    // Even bytes of UYVY (U0 Y0 V0 Y1) hold the interleaved U/V samples.
    uv_lo = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);
    uv_hi = (v16u8) __msa_pckev_b((v16i8) in3, (v16i8) in2);

    // Within the U V U V ... stream, even bytes are U and odd bytes are V.
    u_out = (v16u8) __msa_pckev_b((v16i8) uv_hi, (v16i8) uv_lo);
    v_out = (v16u8) __msa_pckod_b((v16i8) uv_hi, (v16i8) uv_lo);

    ST_UB(u_out, dst_u);
    dst_u += 16;
    ST_UB(v_out, dst_v);
    dst_v += 16;
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user