diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7307a2d16..1b81f431a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,6 +82,14 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 add_library( ${ly_lib_name}_common_objects OBJECT ${ly_common_source_files} )
 set(ly_lib_parts $<TARGET_OBJECTS:${ly_lib_name}_common_objects>)
 
+string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
+set(LOONGARCH64_ALIASES loongarch64)
+list(FIND LOONGARCH64_ALIASES "${SYSPROC}" LOONGARCH64MATCH)
+
+if(LOONGARCH64MATCH GREATER "-1")
+  set(LOONGARCH64 1)
+endif()
+
 if(NOT MSVC)
   string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" arch_lowercase)
@@ -140,6 +148,33 @@ int main(void) { return 0; }
 endif()
 endif()
 
+if(LOONGARCH64)
+  include(CheckCXXSourceCompiles)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-narrowing")
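+  # Probe the toolchain with a sample LSX/LASX instruction; the matching
+  # -mlsx/-mlasx flag is only added below when its probe compiles.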
+  check_cxx_source_compiles("
+    int main(int argc, char **argv) {
+      __asm__ volatile (
+        \"vadd.w $vr0, $vr1, $vr1\"
+      );
+      return 0; }" SUPPORTS_LSX)
+
+  check_cxx_source_compiles("
+    int main(int argc, char **argv) {
+      __asm__ volatile (
+        \"xvadd.w $xr0, $xr1, $xr1\"
+      );
+      return 0; }" SUPPORTS_LASX)
+
+  if(SUPPORTS_LSX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx")
+  endif()
+  if(SUPPORTS_LASX)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlasx")
+  endif()
+endif()
+
 # this creates the static library (.a)
 add_library( ${ly_lib_static} STATIC ${ly_lib_parts})
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index dab0d9644..76c9f78f5 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -2289,6 +2289,22 @@ int I420AlphaToARGBMatrix(const uint8_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2437,6 +2453,22 @@ int I422AlphaToARGBMatrix(const uint8_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2567,6 +2599,22 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2813,6 +2861,22 @@ int I010AlphaToARGBMatrix(const uint16_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -2932,6 +2996,22 @@ int I210AlphaToARGBMatrix(const uint16_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -3049,6 +3129,22 @@ int I410AlphaToARGBMatrix(const uint16_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -7503,6 +7599,22 @@ static int I420AlphaToARGBMatrixBilinear(
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
 #if defined(HAS_SCALEROWUP2_BILINEAR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -7718,6 +7830,22 @@ static int I422AlphaToARGBMatrixLinear(const uint8_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
 #if defined(HAS_SCALEROWUP2_LINEAR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -7877,6 +8005,22 @@ static int I010AlphaToARGBMatrixBilinear(
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
 #if defined(HAS_SCALEROWUP2_BILINEAR_12_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -8058,6 +8202,22 @@ static int I210AlphaToARGBMatrixLinear(const uint16_t* src_y,
     ARGBAttenuateRow = ARGBAttenuateRow_RVV;
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LSX;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LSX;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_LASX)
+  if (TestCpuFlag(kCpuHasLASX)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_LASX;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_LASX;
+    }
+  }
+#endif
 
 #if defined(HAS_SCALEROWUP2_LINEAR_12_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
diff --git a/source/row_lasx.cc b/source/row_lasx.cc
index 6d49aa5e8..734d7ee29 100644
--- a/source/row_lasx.cc
+++ b/source/row_lasx.cc
@@ -1148,24 +1148,29 @@ void ARGBAttenuateRow_LASX(const uint8_t* src_argb,
   __m256i b, g, r, a, dst0, dst1;
   __m256i control = {0x0005000100040000, 0x0007000300060002, 0x0005000100040000,
                      0x0007000300060002};
+  __m256i zero = __lasx_xvldi(0);
+  __m256i const_add = __lasx_xvldi(0x8ff);
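+  // xvldi(0x8ff) splats 255 into every 32-bit lane. Using it as the
+  // accumulator of the widening multiply-adds below yields x * a + 255,
+  // so the >> 8 narrowing shift produces (x * a + 255) >> 8 per channel.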
 
   for (x = 0; x < len; x++) {
     DUP2_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src0, src1);
     tmp0 = __lasx_xvpickev_b(src1, src0);
     tmp1 = __lasx_xvpickod_b(src1, src0);
-    b = __lasx_xvpackev_b(tmp0, tmp0);
-    r = __lasx_xvpackod_b(tmp0, tmp0);
-    g = __lasx_xvpackev_b(tmp1, tmp1);
-    a = __lasx_xvpackod_b(tmp1, tmp1);
-    reg0 = __lasx_xvmulwev_w_hu(b, a);
-    reg1 = __lasx_xvmulwod_w_hu(b, a);
-    reg2 = __lasx_xvmulwev_w_hu(r, a);
-    reg3 = __lasx_xvmulwod_w_hu(r, a);
-    reg4 = __lasx_xvmulwev_w_hu(g, a);
-    reg5 = __lasx_xvmulwod_w_hu(g, a);
-    reg0 = __lasx_xvssrani_h_w(reg1, reg0, 24);
-    reg2 = __lasx_xvssrani_h_w(reg3, reg2, 24);
-    reg4 = __lasx_xvssrani_h_w(reg5, reg4, 24);
+    b = __lasx_xvpackev_b(zero, tmp0);
+    r = __lasx_xvpackod_b(zero, tmp0);
+    g = __lasx_xvpackev_b(zero, tmp1);
+    a = __lasx_xvpackod_b(zero, tmp1);
+    reg0 = __lasx_xvmaddwev_w_hu(const_add, b, a);
+    reg1 = __lasx_xvmaddwod_w_hu(const_add, b, a);
+    reg2 = __lasx_xvmaddwev_w_hu(const_add, r, a);
+    reg3 = __lasx_xvmaddwod_w_hu(const_add, r, a);
+    reg4 = __lasx_xvmaddwev_w_hu(const_add, g, a);
+    reg5 = __lasx_xvmaddwod_w_hu(const_add, g, a);
+    reg0 = __lasx_xvssrani_h_w(reg1, reg0, 8);
+    reg2 = __lasx_xvssrani_h_w(reg3, reg2, 8);
+    reg4 = __lasx_xvssrani_h_w(reg5, reg4, 8);
     reg0 = __lasx_xvshuf_h(control, reg0, reg0);
     reg2 = __lasx_xvshuf_h(control, reg2, reg2);
     reg4 = __lasx_xvshuf_h(control, reg4, reg4);
diff --git a/source/row_lsx.cc b/source/row_lsx.cc
index ee74cad9f..50d5ba6a0 100644
--- a/source/row_lsx.cc
+++ b/source/row_lsx.cc
@@ -1102,24 +1102,28 @@ void ARGBAttenuateRow_LSX(const uint8_t* src_argb,
   __m128i reg0, reg1, reg2, reg3, reg4, reg5;
   __m128i b, g, r, a, dst0, dst1;
   __m128i control = {0x0005000100040000, 0x0007000300060002};
+  __m128i zero = __lsx_vldi(0);
+  __m128i const_add = __lsx_vldi(0x8ff);
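+  // As in the LASX path: vldi(0x8ff) splats 255 per 32-bit lane, giving
+  // (x * a + 255) >> 8 after the narrowing shift.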
 
   for (x = 0; x < len; x++) {
     DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
     tmp0 = __lsx_vpickev_b(src1, src0);
     tmp1 = __lsx_vpickod_b(src1, src0);
-    b = __lsx_vpackev_b(tmp0, tmp0);
-    r = __lsx_vpackod_b(tmp0, tmp0);
-    g = __lsx_vpackev_b(tmp1, tmp1);
-    a = __lsx_vpackod_b(tmp1, tmp1);
-    reg0 = __lsx_vmulwev_w_hu(b, a);
-    reg1 = __lsx_vmulwod_w_hu(b, a);
-    reg2 = __lsx_vmulwev_w_hu(r, a);
-    reg3 = __lsx_vmulwod_w_hu(r, a);
-    reg4 = __lsx_vmulwev_w_hu(g, a);
-    reg5 = __lsx_vmulwod_w_hu(g, a);
-    reg0 = __lsx_vssrani_h_w(reg1, reg0, 24);
-    reg2 = __lsx_vssrani_h_w(reg3, reg2, 24);
-    reg4 = __lsx_vssrani_h_w(reg5, reg4, 24);
+    b = __lsx_vpackev_b(zero, tmp0);
+    r = __lsx_vpackod_b(zero, tmp0);
+    g = __lsx_vpackev_b(zero, tmp1);
+    a = __lsx_vpackod_b(zero, tmp1);
+    reg0 = __lsx_vmaddwev_w_hu(const_add, b, a);
+    reg1 = __lsx_vmaddwod_w_hu(const_add, b, a);
+    reg2 = __lsx_vmaddwev_w_hu(const_add, r, a);
+    reg3 = __lsx_vmaddwod_w_hu(const_add, r, a);
+    reg4 = __lsx_vmaddwev_w_hu(const_add, g, a);
+    reg5 = __lsx_vmaddwod_w_hu(const_add, g, a);
+    reg0 = __lsx_vssrani_h_w(reg1, reg0, 8);
+    reg2 = __lsx_vssrani_h_w(reg3, reg2, 8);
+    reg4 = __lsx_vssrani_h_w(reg5, reg4, 8);
     reg0 = __lsx_vshuf_h(control, reg0, reg0);
     reg2 = __lsx_vshuf_h(control, reg2, reg2);
     reg4 = __lsx_vshuf_h(control, reg4, reg4);