# Patch summary (free-form text ahead of the first "diff --git" header).
#
# Purpose: add MIPS MSA row functions for ARGB down-scaling and wire them into
# the build files and the ARGB scaler dispatch.
#
# Build files:
#   - Android.mk: appends source/scale_msa.cc to LOCAL_SRC_FILES inside the
#     TARGET_ARCH_ABI == mips branch; the source/row_gcc.cc line is removed and
#     re-added (content looks identical here, presumably a whitespace-only change).
#   - BUILD.gn: adds source/scale_msa.cc to the sources list under libyuv_use_msa.
#   - CMakeLists.txt: adds scale_msa.cc to ly_source_files; two blank-line hunks
#     look like whitespace-only cleanups.
#   - libyuv.gypi: adds source/scale_msa.cc to the source list.
#
# include/libyuv/scale_row.h:
#   - Defines HAS_SCALEARGBROWDOWN2_MSA and HAS_SCALEARGBROWDOWNEVEN_MSA when
#     LIBYUV_DISABLE_MSA is not defined and __mips_msa is defined.
#   - Declares ScaleARGBRowDown2{,Linear,Box}_MSA, ScaleARGBRowDownEven{,Box}_MSA
#     and the matching *_Any_MSA variants.
#   - NOTE(review): the EvenBox declarations use uint8/int while the other new
#     declarations use uint8_t/int32_t; declaration and definition agree with each
#     other, so this is a naming inconsistency only.
#
# source/scale_any.cc:
#   - Instantiates the *_Any_MSA wrappers through the existing SDANY / SDAANY
#     macros with a trailing mask argument of 3. The macro bodies are outside
#     this patch; presumably they split the row into a SIMD part and a C
#     remainder -- confirm against the macro definitions.
#
# source/scale_argb.cc:
#   - ScaleARGBDown2 and ScaleARGBDownEven select the *_Any_MSA functions when
#     TestCpuFlag(kCpuHasMSA) is true, and switch to the plain *_MSA functions
#     only when dst_width is a multiple of 4 (IS_ALIGNED(dst_width, 4)).
#
# source/scale_msa.cc (new file, compiled only when !LIBYUV_DISABLE_MSA &&
# __mips_msa):
#   - Every loop steps x by 4 and advances dst_argb by 16 bytes per iteration,
#     with no tail handling inside the kernels; this matches the dispatch above,
#     which calls them directly only for dst_width % 4 == 0.
#   - ScaleARGBRowDown2_MSA: loads 32 source bytes, combines them with
#     __msa_pckod_w and stores 16 bytes (presumably keeps the odd 32-bit pixels
#     -- confirm against the MSA intrinsic reference). src_stride is unused.
#   - ScaleARGBRowDown2Linear_MSA: same loads, then __msa_aver_u_b of the
#     __msa_pckev_w and __msa_pckod_w results (presumably a rounded average of
#     horizontally adjacent pixels). src_stride is unused.
#   - ScaleARGBRowDown2Box_MSA: reads two rows (src_argb and src_argb +
#     src_stride), byte-shuffles each 16-byte load, sums with __msa_hadd_u_h,
#     adds the two rows, applies __msa_srari_h(.., 2) and packs to bytes
#     (presumably a rounded 2x2 box average).
#   - ScaleARGBRowDownEven_MSA: copies one 32-bit word every src_stepx * 4 bytes
#     using the LW / SW helpers; no vector intrinsics are used and src_stride is
#     unused.
#   - ScaleARGBRowDownEvenBox_MSA: at each of four step positions loads 8 bytes
#     from the current row and from the row at src_argb + src_stride, sums them
#     via interleave + __msa_hadd_u_h, adds the even/odd 64-bit halves, applies
#     __msa_srari_h(.., 2) and packs 16 output bytes.
#
diff --git a/Android.mk b/Android.mk index 3988cb969..983ee1d85 100644 --- a/Android.mk +++ b/Android.mk @@ -29,7 +29,7 @@ LOCAL_SRC_FILES := \ source/row_common.cc \ source/row_mips.cc \ source/row_neon64.cc \ - source/row_gcc.cc \ + source/row_gcc.cc \ source/scale.cc \ source/scale_any.cc \ source/scale_argb.cc \ @@ -56,7 +56,8 @@ endif ifeq ($(TARGET_ARCH_ABI),mips) LOCAL_CFLAGS += -DLIBYUV_MSA LOCAL_SRC_FILES += \ - source/row_msa.cc + source/row_msa.cc \ + source/scale_msa.cc endif LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include diff --git a/BUILD.gn b/BUILD.gn index 04bf80fff..c63faf5c5 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -164,6 +164,7 @@ if (libyuv_use_msa) { sources = [ # MSA Source Files "source/row_msa.cc", + "source/scale_msa.cc", ] public_configs = [ ":libyuv_config" ] diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b7d2ab1b..2d8f5224e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ set(ly_source_files ${ly_src_dir}/scale_argb.cc ${ly_src_dir}/scale_common.cc ${ly_src_dir}/scale_mips.cc + ${ly_src_dir}/scale_msa.cc ${ly_src_dir}/scale_neon.cc ${ly_src_dir}/scale_neon64.cc ${ly_src_dir}/scale_gcc.cc @@ -130,13 +131,13 @@ if(TEST) if (JPEG_FOUND) target_link_libraries(libyuv_unittest ${JPEG_LIBRARY}) endif() - + if(NACL AND NACL_LIBC STREQUAL "newlib") target_link_libraries(libyuv_unittest glibc-compat) endif() target_link_libraries(libyuv_unittest gflags) - + endif() install(TARGETS ${ly_lib_name} DESTINATION lib) diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index fcdeaf06f..24471920e 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -103,6 +103,11 @@ extern "C" { #define HAS_SCALEROWDOWN38_DSPR2 #endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_SCALEARGBROWDOWN2_MSA +#define HAS_SCALEARGBROWDOWNEVEN_MSA +#endif + // Scale ARGB vertically with bilinear interpolation. 
void ScalePlaneVertical(int src_height, int dst_width, @@ -562,6 +567,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, @@ -586,6 +603,18 @@ void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, @@ -607,6 +636,16 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stepx, uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width); void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -627,6 +666,16 @@ void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, int src_stepx, uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void 
ScaleARGBRowDownEvenBox_Any_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. diff --git a/libyuv.gypi b/libyuv.gypi index 4f68a065c..218d25d93 100644 --- a/libyuv.gypi +++ b/libyuv.gypi @@ -72,6 +72,7 @@ 'source/scale_common.cc', 'source/scale_gcc.cc', 'source/scale_mips.cc', + 'source/scale_msa.cc', 'source/scale_neon.cc', 'source/scale_neon64.cc', 'source/scale_win.cc', diff --git a/source/scale_any.cc b/source/scale_any.cc index f682b40d3..8c1aaa111 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -283,6 +283,26 @@ SDANY(ScaleARGBRowDown2Box_Any_NEON, 4, 7) #endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) +#endif #undef SDANY // Scale down by even scale factor. @@ -322,6 +342,18 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, 4, 3) #endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif // Add rows box filter scale down. #define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 09c6e0ae1..17e61ba70 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -92,6 +92,22 @@ static void ScaleARGBDown2(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; @@ -203,6 +219,16 @@ static void ScaleARGBDownEven(int src_width, } } #endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; + } + } +#endif if (filtering == kFilterLinear) { src_stride = 0; diff --git a/source/scale_msa.cc b/source/scale_msa.cc new file mode 100644 index 000000000..9c7679fdf --- /dev/null +++ b/source/scale_msa.cc @@ -0,0 +1,177 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = 
(v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, + int dst_width) { + int x; + const uint8* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, 
data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)