From 7018f5be0f98419c0e4eac518b0641655ba91c98 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 3 Oct 2016 18:21:31 -0700 Subject: [PATCH] Add MSA optimized I422ToYUY2Row, I422ToUYVYRow functions R=fbarchard@google.com BUG=libyuv:634 Performance gains :- I422ToYUY2Row_MSA - ~12x I422ToYUY2Row_Any_MSA - ~7x I422ToUYVYRow_MSA - ~12x I422ToUYVYRow_Any_MSA - ~7x Review URL: https://codereview.chromium.org/2378753004 . --- include/libyuv/macros_msa.h | 13 ++++++++++ include/libyuv/row.h | 18 ++++++++++++++ source/convert_from.cc | 24 +++++++++++++++++++ source/convert_from_argb.cc | 16 +++++++++++++ source/row_any.cc | 6 +++++ source/row_msa.cc | 48 +++++++++++++++++++++++++++++++++++++ 6 files changed, 125 insertions(+) diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h index 92ed21c38..8a81e8213 100644 --- a/include/libyuv/macros_msa.h +++ b/include/libyuv/macros_msa.h @@ -71,6 +71,19 @@ } #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0'; left half to 'out1' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \ + out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \ +} +#define ILVRL_B2_UB(...) 
ILVRL_B2(v16u8, __VA_ARGS__) + #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ #endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7bbad513c..01cca3358 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -370,6 +370,8 @@ extern "C" { #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_MIRRORROW_MSA #define HAS_ARGBMIRRORROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_I422TOUYVYROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1769,6 +1771,22 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_uyvy, int width); +void I422ToYUY2Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); +void I422ToYUY2Row_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width); +void I422ToUYVYRow_Any_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width); // Effects related row functions. 
void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); diff --git a/source/convert_from.cc b/source/convert_from.cc index 3b2dca816..1256ca99c 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -237,6 +237,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -298,6 +306,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -345,6 +361,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 2a8682b7e..50ede22a6 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -553,6 +553,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. 
@@ -655,6 +663,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. diff --git a/source/row_any.cc b/source/row_any.cc index 1f1b36348..df442b0f6 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -80,9 +80,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif diff --git a/source/row_msa.cc b/source/row_msa.cc index b86865cf3..52a246cdb 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -53,6 +53,54 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) { } } +void I422ToYUY2Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void I422ToUYVYRow_MSA(const uint8* src_y, + const uint8* src_u, + const 
uint8* src_v, + uint8* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv