From a2891ec77c183ec265af8278eee821e4d9715c12 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 7 Oct 2016 10:37:22 -0700 Subject: [PATCH] Add MSA optimized YUY2ToI422, YUY2ToI420, UYVYToI422, UYVYToI420 functions R=fbarchard@google.com BUG=libyuv:634 Performance gains as below, YUY2ToI422, YUY2ToI420 :- YUY2ToYRow_MSA : ~10x YUY2ToUVRow_MSA : ~11x YUY2ToUV422Row_MSA : ~9x YUY2ToYRow_Any_MSA : ~6x YUY2ToUVRow_Any_MSA : ~5x YUY2ToUV422Row_Any_MSA : ~4x UYVYToI422, UYVYToI420 :- UYVYToYRow_MSA : ~10x UYVYToUVRow_MSA : ~11x UYVYToUV422Row_MSA : ~9x UYVYToYRow_Any_MSA : ~6x UYVYToUVRow_Any_MSA : ~5x UYVYToUV422Row_Any_MSA : ~4x Review URL: https://codereview.chromium.org/2397693002 . --- docs/getting_started.md | 6 +- include/libyuv/row.h | 26 ++++++++ source/convert.cc | 20 +++++++ source/planar_functions.cc | 20 +++++++ source/row_any.cc | 16 +++++ source/row_msa.cc | 120 +++++++++++++++++++++++++++++++++++++ 6 files changed, 205 insertions(+), 3 deletions(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 9524d9d6c..a0d50a94e 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -166,12 +166,12 @@ ia32 mipsel gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" - gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" ninja -j7 -v -C out/Debug libyuv_unittest ninja -j7 -v -C out/Release libyuv_unittest - gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" - gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" + gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false" ninja -j7 -v -C out/Debug libyuv_unittest ninja -j7 -v -C out/Release libyuv_unittest diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 01cca3358..1a17cbf4e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -372,6 +372,12 @@ extern "C" { #define HAS_ARGBMIRRORROW_MSA #define HAS_I422TOYUY2ROW_MSA #define HAS_I422TOUYVYROW_MSA +#define HAS_YUY2TOYROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_UYVYTOUVROW_MSA + #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1669,6 +1675,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width); +void YUY2ToUVRow_MSA(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUV422Row_MSA(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width); void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int width); @@ -1689,6 +1700,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int width); void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int width); +void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width); +void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int width); +void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int width); void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int width); @@ -1709,6 +1725,11 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width); +void UYVYToUVRow_MSA(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int width); +void UYVYToUV422Row_MSA(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width); void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, @@ -1730,6 +1751,11 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int width); void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int width); +void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width); +void UYVYToUVRow_Any_MSA(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int width); +void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int width); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, diff --git a/source/convert.cc b/source/convert.cc index a33742d24..ed3cd7fd0 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -392,6 +392,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -457,6 +467,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 6fea63079..f9fa212f3 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -482,6 +482,16 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -556,6 +566,16 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); diff --git a/source/row_any.cc b/source/row_any.cc index 53f232ed7..af9ecc511 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -442,6 +442,12 @@ ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #ifdef HAS_UYVYTOYROW_NEON ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) #endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 0, 2, 1, 31) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif @@ -763,6 +769,10 @@ ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif #undef ANY12 // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. @@ -848,6 +858,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif #undef ANY12S #ifdef __cplusplus diff --git a/source/row_msa.cc b/source/row_msa.cc index 52a246cdb..acc60520a 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -101,6 +101,126 @@ void I422ToUYVYRow_MSA(const uint8* src_y, } } +void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); + dst1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); + src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2); + src2 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4); + src3 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); + dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); + src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2); + dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); + dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); + dst1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_u, uint8* dst_v, int width) { + const uint8 *src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); + src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2); + src2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4); + src3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0); + dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); + src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2); + dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0); + dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv