From 2f87e9a7135b83656c86b3b23ba582e0dfeb7fbb Mon Sep 17 00:00:00 2001
From: Hao Chen
Date: Mon, 20 Dec 2021 20:23:28 +0800
Subject: [PATCH] Add optimization functions in scale_lsx.cc file.

Optimize 20 functions in the source/scale_lsx.cc file. All test cases
passed on the LoongArch platform.

Bug: libyuv:913
Change-Id: I85bcb3b0bfd9461bb6f93202546507352cbd624a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3351469
Reviewed-by: Frank Barchard
---
 include/libyuv/scale_row.h | 180 ++++++++++
 source/rotate_argb.cc      |   8 +
 source/scale.cc            |  88 +++++
 source/scale_any.cc        | 115 ++++++
 source/scale_argb.cc       |  74 ++++
 source/scale_lsx.cc        | 738 +++++++++++++++++++++++++++++++++++++
 6 files changed, 1203 insertions(+)
 create mode 100644 source/scale_lsx.cc
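Every dispatch hunk below follows the same runtime-selection idiom: install
the _Any_LSX wrapper when TestCpuFlag(kCpuHasLSX) reports LSX support, then
upgrade to the full-width LSX kernel once the width is a multiple of the
kernel's block size. A minimal sketch of the idiom, with the surrounding
plumbing simplified (the real hunks appear in context below):

    // Start from the portable C row function.
    void (*ScaleRowDown2)(const uint8_t*, ptrdiff_t, uint8_t*, int) =
        ScaleRowDown2_C;
    if (TestCpuFlag(kCpuHasLSX)) {
      ScaleRowDown2 = ScaleRowDown2_Any_LSX;  // any width; C tail inside
      if (IS_ALIGNED(dst_width, 32)) {
        ScaleRowDown2 = ScaleRowDown2_LSX;  // 32 output pixels per iteration
      }
    }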
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 461ac36f3..249f20967 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -178,6 +178,19 @@ extern "C" {
 #define HAS_SCALEROWDOWN34_MMI
 #endif
 
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#define HAS_SCALEARGBROWDOWN2_LSX
+#define HAS_SCALEARGBROWDOWNEVEN_LSX
+#define HAS_SCALEROWDOWN2_LSX
+#define HAS_SCALEROWDOWN4_LSX
+#define HAS_SCALEROWDOWN38_LSX
+#define HAS_SCALEFILTERCOLS_LSX
+#define HAS_SCALEADDROW_LSX
+#define HAS_SCALEARGBCOLS_LSX
+#define HAS_SCALEARGBFILTERCOLS_LSX
+#define HAS_SCALEROWDOWN34_LSX
+#endif
+
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
                         int dst_width,
@@ -931,6 +944,18 @@ void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
                               int dst_width);
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
 void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_ptr,
                                 int dst_width);
@@ -979,6 +1004,18 @@ void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint8_t* dst_ptr,
                                   int dst_width);
+void ScaleARGBRowDown2_Any_LSX(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
 void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
                                uint8_t* dst_argb,
                                int dst_width);
@@ -1019,6 +1056,16 @@ void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width);
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
 void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    int src_stepx,
                                    uint8_t* dst_ptr,
                                    int dst_width);
@@ -1059,6 +1106,16 @@ void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
                                     int src_stepx,
                                     uint8_t* dst_ptr,
                                     int dst_width);
+void ScaleARGBRowDownEven_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_LSX(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
 
 // UV Row functions
 void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr,
@@ -1718,6 +1775,129 @@ void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr,
 void ScaleAddRow_Any_MMI(const uint8_t* src_ptr,
                          uint16_t* dst_ptr,
                          int src_width);
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown2_Any_LSX(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_LSX(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_LSX(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_LSX(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_LSX(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_LSX(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_Any_LSX(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_Any_LSX(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleRowDown34_Any_LSX(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_LSX(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 4d36a910c..7bcc65d44 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -68,6 +68,14 @@ static int ARGBTranspose(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_LSX;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_LSX;
+    }
+  }
+#endif
 
   for (i = 0; i < width; ++i) {  // column of source to row of dest.
     ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
diff --git a/source/scale.cc b/source/scale.cc
index ebb8a283c..657d71513 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -134,6 +134,21 @@ static void ScalePlaneDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN2_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_LSX
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_LSX
+                                          : ScaleRowDown2Box_Any_LSX);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_LSX
+                      : (filtering == kFilterLinear
+                             ? ScaleRowDown2Linear_LSX
+                             : ScaleRowDown2Box_LSX);
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -274,6 +289,15 @@ static void ScalePlaneDown4(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN4_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_LSX : ScaleRowDown4_Any_LSX;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_LSX : ScaleRowDown4_LSX;
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -411,6 +435,26 @@ static void ScalePlaneDown34(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_LSX;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_LSX;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_LSX;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_LSX;
+    }
+    if (dst_width % 48 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_LSX;
+        ScaleRowDown34_1 = ScaleRowDown34_LSX;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_LSX;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_LSX;
+      }
+    }
+  }
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
@@ -626,6 +670,26 @@ static void ScalePlaneDown38(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN38_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_Any_LSX;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_LSX;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_LSX;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_LSX;
+    }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_LSX;
+        ScaleRowDown38_2 = ScaleRowDown38_LSX;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_LSX;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_LSX;
+      }
+    }
+  }
+#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
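The width multiples checked in the hunks above fall out of how many pixels
each new LSX kernel consumes and produces per loop iteration; the table
below is inferred from the kernels in scale_lsx.cc, not stated in the patch:

    // ScaleRowDown2_LSX:  reads 4 x 16 = 64 src bytes, writes 32 -> dst % 32
    // ScaleRowDown4_LSX:  reads 64 src bytes, writes 16          -> dst % 16
    // ScaleRowDown34_LSX: reads 64 src bytes, writes 48          -> dst % 48
    // ScaleRowDown38_LSX: reads 32 src bytes, writes 12          -> dst % 12
    // Widths that fail the check use the _Any_ wrapper, which runs the LSX
    // kernel on the aligned part and the C kernel on the remainder.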
@@ -907,6 +971,14 @@ static void ScalePlaneBox(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEADDROW_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleAddRow = ScaleAddRow_Any_LSX;
+    if (IS_ALIGNED(src_width, 16)) {
+      ScaleAddRow = ScaleAddRow_LSX;
+    }
+  }
+#endif
 
   for (j = 0; j < dst_height; ++j) {
     int boxheight;
@@ -1088,6 +1160,14 @@ void ScalePlaneBilinearDown(int src_width,
       ScaleFilterCols = ScaleFilterCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+  if (TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_LSX;
+    }
+  }
 #endif
   if (y > max_y) {
     y = max_y;
@@ -1278,6 +1358,14 @@ void ScalePlaneBilinearUp(int src_width,
       ScaleFilterCols = ScaleFilterCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_LSX)
+  if (filtering && TestCpuFlag(kCpuHasLSX) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_LSX;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
diff --git a/source/scale_any.cc b/source/scale_any.cc
index f1d89abb3..9acd6cc87 100644
--- a/source/scale_any.cc
+++ b/source/scale_any.cc
@@ -173,6 +173,21 @@ SDODD(ScaleRowDown2Box_Odd_MMI,
       1,
       7)
 #endif
+#ifdef HAS_SCALEROWDOWN2_LSX
+SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_LSX,
+      ScaleRowDown2Linear_LSX,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_LSX,
+      ScaleRowDown2Box_LSX,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+#endif
 #ifdef HAS_SCALEROWDOWN4_SSSE3
 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
 SDANY(ScaleRowDown4Box_Any_SSSE3,
@@ -218,6 +233,15 @@ SDANY(ScaleRowDown4Box_Any_MMI,
       1,
       7)
 #endif
+#ifdef HAS_SCALEROWDOWN4_LSX
+SDANY(ScaleRowDown4_Any_LSX, ScaleRowDown4_LSX, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_LSX,
+      ScaleRowDown4Box_LSX,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
+#endif
 #ifdef HAS_SCALEROWDOWN34_SSSE3
 SDANY(ScaleRowDown34_Any_SSSE3,
       ScaleRowDown34_SSSE3,
@@ -278,6 +302,26 @@ SDANY(ScaleRowDown34_1_Box_Any_MSA,
       1,
       47)
 #endif
+#ifdef HAS_SCALEROWDOWN34_LSX
+SDANY(ScaleRowDown34_Any_LSX,
+      ScaleRowDown34_LSX,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_0_Box_Any_LSX,
+      ScaleRowDown34_0_Box_LSX,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_1_Box_Any_LSX,
+      ScaleRowDown34_1_Box_LSX,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      47)
+#endif
 #ifdef HAS_SCALEROWDOWN34_MMI
 SDANY(ScaleRowDown34_Any_MMI,
       ScaleRowDown34_MMI,
@@ -346,6 +390,26 @@ SDANY(ScaleRowDown38_2_Box_Any_MSA,
       1,
       11)
 #endif
+#ifdef HAS_SCALEROWDOWN38_LSX
+SDANY(ScaleRowDown38_Any_LSX,
+      ScaleRowDown38_LSX,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_LSX,
+      ScaleRowDown38_3_Box_LSX,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_LSX,
+      ScaleRowDown38_2_Box_LSX,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
+#endif
 
 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
 SDANY(ScaleARGBRowDown2_Any_SSE2,
@@ -427,6 +491,26 @@ SDANY(ScaleARGBRowDown2Box_Any_MMI,
       4,
       1)
 #endif
+#ifdef HAS_SCALEARGBROWDOWN2_LSX
+SDANY(ScaleARGBRowDown2_Any_LSX,
+      ScaleARGBRowDown2_LSX,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_LSX,
+      ScaleARGBRowDown2Linear_LSX,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_LSX,
+      ScaleARGBRowDown2Box_LSX,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
+#endif
 #undef SDANY
 
 // Scale down by even scale factor.
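For readers new to scale_any.cc: SDANY generates a wrapper that runs the SIMD
kernel on the width it can handle and finishes the remainder with the C
kernel. Roughly, SDANY(ScaleRowDown2_Any_LSX, ScaleRowDown2_LSX,
ScaleRowDown2_C, 2, 1, 31) expands to the following (a simplified sketch, not
the exact macro body):

    void ScaleRowDown2_Any_LSX(const uint8_t* src_ptr, ptrdiff_t src_stride,
                               uint8_t* dst_ptr, int dst_width) {
      int r = dst_width & 31;  // pixels beyond the last full 32-pixel block
      int n = dst_width - r;   // multiple-of-32 part for the LSX kernel
      if (n > 0) {
        ScaleRowDown2_LSX(src_ptr, src_stride, dst_ptr, n);
      }
      // FACTOR=2, BPP=1: each output pixel consumes 2 source bytes.
      ScaleRowDown2_C(src_ptr + n * 2, src_stride, dst_ptr + n, r);
    }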
@@ -490,6 +574,18 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
        4,
        1)
 #endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_LSX
+SDAANY(ScaleARGBRowDownEven_Any_LSX,
+       ScaleARGBRowDownEven_LSX,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_LSX,
+       ScaleARGBRowDownEvenBox_LSX,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
 #ifdef HAS_SCALEUVROWDOWNEVEN_NEON
 SDAANY(ScaleUVRowDownEven_Any_NEON,
        ScaleUVRowDownEven_NEON,
@@ -530,6 +626,9 @@ SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
 #ifdef HAS_SCALEADDROW_MSA
 SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
 #endif
+#ifdef HAS_SCALEADDROW_LSX
+SAROW(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, 1, 2, 15)
+#endif
 #ifdef HAS_SCALEADDROW_MMI
 SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
 #endif
@@ -562,6 +661,9 @@ SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
 #ifdef HAS_SCALEADDROW_MMI
 SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
 #endif
+#ifdef HAS_SCALEADDROW_LSX
+SAANY(ScaleAddRow_Any_LSX, ScaleAddRow_LSX, ScaleAddRow_C, 15)
+#endif
 #undef SAANY
 
 #endif  // SASIMDONLY
@@ -584,12 +686,18 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
 #ifdef HAS_SCALEFILTERCOLS_MSA
 CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
 #endif
+#ifdef HAS_SCALEFILTERCOLS_LSX
+CANY(ScaleFilterCols_Any_LSX, ScaleFilterCols_LSX, ScaleFilterCols_C, 1, 15)
+#endif
 #ifdef HAS_SCALEARGBCOLS_NEON
 CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
 #endif
 #ifdef HAS_SCALEARGBCOLS_MSA
 CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
 #endif
+#ifdef HAS_SCALEARGBCOLS_LSX
+CANY(ScaleARGBCols_Any_LSX, ScaleARGBCols_LSX, ScaleARGBCols_C, 4, 3)
+#endif
 #ifdef HAS_SCALEARGBCOLS_MMI
 CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0)
 #endif
@@ -607,6 +715,13 @@ CANY(ScaleARGBFilterCols_Any_MSA,
      4,
      7)
 #endif
+#ifdef HAS_SCALEARGBFILTERCOLS_LSX
+CANY(ScaleARGBFilterCols_Any_LSX,
+     ScaleARGBFilterCols_LSX,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
 #undef CANY
 
 // Scale up horizontally 2 times using linear filter.
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index a5d5ee9c5..d20ec8c54 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -127,6 +127,22 @@ static void ScaleARGBDown2(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_LSX
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_LSX
+                                          : ScaleARGBRowDown2Box_Any_LSX);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_LSX
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_LSX
+                                            : ScaleARGBRowDown2Box_LSX);
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -263,6 +279,16 @@ static void ScaleARGBDownEven(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_LSX
+                                     : ScaleARGBRowDownEven_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_LSX : ScaleARGBRowDownEven_LSX;
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -368,6 +394,14 @@ static void ScaleARGBBilinearDown(int src_width,
       ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+    }
+  }
 #endif
   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   // Allocate a row of ARGB.
@@ -493,6 +527,14 @@ static void ScaleARGBBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+  if (filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -521,6 +563,14 @@ static void ScaleARGBBilinearUp(int src_width,
       ScaleARGBFilterCols = ScaleARGBCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_LSX;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -740,6 +790,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_LSX)
+  if (filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_LSX;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -768,6 +826,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width,
       ScaleARGBFilterCols = ScaleARGBCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (!filtering && TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_LSX;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
@@ -912,6 +978,14 @@ static void ScaleARGBSimple(int src_width,
       ScaleARGBCols = ScaleARGBCols_MSA;
     }
   }
+#endif
+#if defined(HAS_SCALEARGBCOLS_LSX)
+  if (TestCpuFlag(kCpuHasLSX)) {
+    ScaleARGBCols = ScaleARGBCols_Any_LSX;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_LSX;
+    }
+  }
 #endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
diff --git a/source/scale_lsx.cc b/source/scale_lsx.cc
new file mode 100644
index 000000000..d8181b3e7
--- /dev/null
+++ b/source/scale_lsx.cc
@@ -0,0 +1,738 @@
+/*
+ *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Copyright (c) 2022 Loongson Technology Corporation Limited
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
+#include "libyuv/loongson_intrinsics.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define LOAD_DATA(_src, _in, _out)                                \
+  {                                                               \
+    int _tmp1, _tmp2, _tmp3, _tmp4;                               \
+    DUP4_ARG2(__lsx_vpickve2gr_w, _in, 0, _in, 1, _in, 2, _in, 3, \
+              _tmp1, _tmp2, _tmp3, _tmp4);                        \
+    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp1], 0);               \
+    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp2], 1);               \
+    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp3], 2);               \
+    _out = __lsx_vinsgr2vr_w(_out, _src[_tmp4], 3);               \
+  }
+
+void ScaleARGBRowDown2_LSX(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  int x;
+  int len = dst_width / 4;
+  (void)src_stride;
+  __m128i src0, src1, dst0;
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+    dst0 = __lsx_vpickod_w(src1, src0);
+    __lsx_vst(dst0, dst_argb, 0);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Linear_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  int len = dst_width / 4;
+  (void)src_stride;
+  __m128i src0, src1, tmp0, tmp1, dst0;
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lsx_vld, src_argb, 0, src_argb, 16, src0, src1);
+    tmp0 = __lsx_vpickev_w(src1, src0);
+    tmp1 = __lsx_vpickod_w(src1, src0);
+    dst0 = __lsx_vavgr_bu(tmp1, tmp0);
+    __lsx_vst(dst0, dst_argb, 0);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDown2Box_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  int len = dst_width / 4;
+  const uint8_t* s = src_argb;
+  const uint8_t* t = src_argb + src_stride;
+  __m128i src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3, dst0;
+  __m128i reg0, reg1, reg2, reg3;
+  __m128i shuff = {0x0703060205010400, 0x0F0B0E0A0D090C08};
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lsx_vld, s, 0, s, 16, src0, src1);
+    DUP2_ARG2(__lsx_vld, t, 0, t, 16, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff, src1, src1, shuff, src2, src2,
+              shuff, src3, src3, shuff, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vhaddw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vsadd_hu, reg0, reg2, reg1, reg3, reg0, reg1);
+    dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+    __lsx_vst(dst0, dst_argb, 0);
+    s += 32;
+    t += 32;
+    dst_argb += 16;
+  }
+}
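Per output pixel, the Box kernel above computes a rounded 2x2 box average of
each ARGB channel; the vshuf_b/vhaddw/vsadd sequence plus the rounding narrow
by 2 is equivalent to this scalar sketch (using s and t as in the function):

    for (int i = 0; i < dst_width * 4; ++i) {  // 4 bytes (B,G,R,A) per pixel
      int j = (i / 4) * 8 + (i & 3);  // top-left byte of the 2x2 source block
      dst_argb[i] = (uint8_t)((s[j] + s[j + 4] + t[j] + t[j + 4] + 2) >> 2);
    }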
+
+void ScaleARGBRowDownEven_LSX(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  int len = dst_width / 4;
+  int32_t stepx = src_stepx << 2;
+  (void)src_stride;
+  __m128i dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < len; x++) {
+    dst0 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += stepx;
+    dst1 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += stepx;
+    dst2 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += stepx;
+    dst3 = __lsx_vldrepl_w(src_argb, 0);
+    src_argb += stepx;
+    __lsx_vstelm_w(dst0, dst_argb, 0, 0);
+    __lsx_vstelm_w(dst1, dst_argb, 4, 0);
+    __lsx_vstelm_w(dst2, dst_argb, 8, 0);
+    __lsx_vstelm_w(dst3, dst_argb, 12, 0);
+    dst_argb += 16;
+  }
+}
+
+void ScaleARGBRowDownEvenBox_LSX(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  int len = dst_width / 4;
+  int32_t stepx = src_stepx * 4;
+  const uint8_t* next_argb = src_argb + src_stride;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, dst0;
+
+  for (x = 0; x < len; x++) {
+    tmp0 = __lsx_vldrepl_d(src_argb, 0);
+    src_argb += stepx;
+    tmp1 = __lsx_vldrepl_d(src_argb, 0);
+    src_argb += stepx;
+    tmp2 = __lsx_vldrepl_d(src_argb, 0);
+    src_argb += stepx;
+    tmp3 = __lsx_vldrepl_d(src_argb, 0);
+    src_argb += stepx;
+    tmp4 = __lsx_vldrepl_d(next_argb, 0);
+    next_argb += stepx;
+    tmp5 = __lsx_vldrepl_d(next_argb, 0);
+    next_argb += stepx;
+    tmp6 = __lsx_vldrepl_d(next_argb, 0);
+    next_argb += stepx;
+    tmp7 = __lsx_vldrepl_d(next_argb, 0);
+    next_argb += stepx;
+    DUP4_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+    DUP2_ARG2(__lsx_vpackev_w, tmp1, tmp0, tmp3, tmp2, reg0, reg1);
+    DUP2_ARG2(__lsx_vpackod_w, tmp1, tmp0, tmp3, tmp2, tmp4, tmp5);
+    DUP2_ARG2(__lsx_vadd_h, reg0, tmp4, reg1, tmp5, reg0, reg1);
+    dst0 = __lsx_vsrarni_b_h(reg1, reg0, 2);
+    dst0 = __lsx_vshuf4i_b(dst0, 0xD8);
+    __lsx_vst(dst0, dst_argb, 0);
+    dst_argb += 16;
+  }
+}
+
+void ScaleRowDown2_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  int len = dst_width / 32;
+  __m128i src0, src1, src2, src3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Linear_LSX(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int x;
+  int len = dst_width / 32;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vpickod_b, src1, src0, src3, src2, tmp1, tmp3);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown2Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  int len = dst_width / 32;
+  const uint8_t* src_nex = src_ptr + src_stride;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i dst0, dst1;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vsrarni_b_h, tmp1, tmp0, 2, tmp3, tmp2, 2, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    src_ptr += 64;
+    src_nex += 64;
+    dst += 32;
+  }
+}
+
+void ScaleRowDown4_LSX(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  int len = dst_width / 16;
+  __m128i src0, src1, src2, src3, tmp0, tmp1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vpickev_b, src1, src0, src3, src2, tmp0, tmp1);
+    dst0 = __lsx_vpickod_b(tmp1, tmp0);
+    __lsx_vst(dst0, dst, 0);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown4Box_LSX(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  int len = dst_width / 16;
+  const uint8_t* ptr1 = src_ptr + src_stride;
+  const uint8_t* ptr2 = ptr1 + src_stride;
+  const uint8_t* ptr3 = ptr2 + src_stride;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, dst0;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vld, ptr1, 0, ptr1, 16, ptr1, 32, ptr1, 48,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+              reg0, reg1, reg2, reg3);
+    DUP4_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, ptr2, 32, ptr2, 48,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vld, ptr3, 0, ptr3, 16, ptr3, 32, ptr3, 48,
+              src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vaddwev_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vaddwod_h_bu, src0, src4, src1, src5, src2, src6, src3,
+              src7, tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+              reg4, reg5, reg6, reg7);
+    DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+              reg0, reg1, reg2, reg3);
+    DUP4_ARG2(__lsx_vhaddw_wu_hu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, reg0, reg1, reg2, reg3);
+    DUP2_ARG3(__lsx_vsrarni_h_w, reg1, reg0, 4, reg3, reg2, 4, tmp0, tmp1);
+    dst0 = __lsx_vpickev_b(tmp1, tmp0);
+    __lsx_vst(dst0, dst, 0);
+    src_ptr += 64;
+    ptr1 += 64;
+    ptr2 += 64;
+    ptr3 += 64;
+    dst += 16;
+  }
+}
+
+void ScaleRowDown38_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x, len;
+  __m128i src0, src1, tmp0;
+  __m128i shuff = {0x13100E0B08060300, 0x000000001E1B1816};
+
+  assert(dst_width % 3 == 0);
+  len = dst_width / 12;
+  (void)src_stride;
+
+  for (x = 0; x < len; x++) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    tmp0 = __lsx_vshuf_b(src1, src0, shuff);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_w(tmp0, dst, 8, 2);
+    src_ptr += 32;
+    dst += 12;
+  }
+}
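The shuffle constant above selects bytes 0,3,6, 8,11,14, 16,19,22, 24,27,30
from each 32-byte pair of loads, i.e. the same keep-3-of-8 pattern as the
scalar version of this row function (sketch):

    // Scalar equivalent: 3 output pixels per 8 input pixels.
    for (int x = 0; x < dst_width; x += 3) {
      dst[0] = src_ptr[0];
      dst[1] = src_ptr[3];
      dst[2] = src_ptr[6];
      dst += 3;
      src_ptr += 8;
    }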
+
+void ScaleRowDown38_2_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, len;
+  const uint8_t* src_nex = src_ptr + src_stride;
+  __m128i src0, src1, src2, src3, dst0;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, reg2, reg3;
+  __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+  __m128i const_0x2AAA = __lsx_vreplgr2vr_h(0x2AAA);
+  __m128i const_0x4000 = __lsx_vreplgr2vr_w(0x4000);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  len = dst_width / 12;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_nex, 0, src_nex, 16,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+    DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+    DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+    tmp4 = __lsx_vpickev_w(reg3, reg2);
+    tmp5 = __lsx_vadd_h(reg0, reg1);
+    tmp6 = __lsx_vadd_h(tmp5, tmp4);
+    tmp7 = __lsx_vmuh_h(tmp6, const_0x2AAA);
+    tmp0 = __lsx_vpickod_w(reg3, reg2);
+    tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+    tmp2 = __lsx_vmul_w(tmp1, const_0x4000);
+    dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+    __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+    __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+    src_ptr += 32;
+    src_nex += 32;
+    dst_ptr += 12;
+  }
+}
+
+void ScaleRowDown38_3_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, len;
+  const uint8_t* ptr1 = src_ptr + src_stride;
+  const uint8_t* ptr2 = ptr1 + src_stride;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, reg2, reg3, dst0;
+  __m128i zero = __lsx_vldi(0);
+  __m128i shuff = {0x0A08160604120200, 0x000000001E0E0C1A};
+  __m128i const_0x1C71 = __lsx_vreplgr2vr_h(0x1C71);
+  __m128i const_0x2AAA = __lsx_vreplgr2vr_w(0x2AAA);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  len = dst_width / 12;
+
+  for (x = 0; x < len; x++) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, ptr1, 0, ptr1, 16,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vld, ptr2, 0, ptr2, 16, src4, src5);
+    DUP2_ARG2(__lsx_vaddwev_h_bu, src0, src2, src1, src3, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vaddwod_h_bu, src0, src2, src1, src3, tmp1, tmp3);
+    DUP2_ARG2(__lsx_vpackev_b, zero, src4, zero, src5, tmp4, tmp6);
+    DUP2_ARG2(__lsx_vpackod_b, zero, src4, zero, src5, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vpickev_h, tmp2, tmp0, tmp3, tmp1, reg0, reg1);
+    DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, reg2, reg3);
+    tmp4 = __lsx_vpickev_w(reg3, reg2);
+    tmp5 = __lsx_vadd_h(reg0, reg1);
+    tmp6 = __lsx_vadd_h(tmp5, tmp4);
+    tmp7 = __lsx_vmuh_h(tmp6, const_0x1C71);
+    tmp0 = __lsx_vpickod_w(reg3, reg2);
+    tmp1 = __lsx_vhaddw_wu_hu(tmp0, tmp0);
+    tmp2 = __lsx_vmul_w(tmp1, const_0x2AAA);
+    dst0 = __lsx_vshuf_b(tmp2, tmp7, shuff);
+    __lsx_vstelm_d(dst0, dst_ptr, 0, 0);
+    __lsx_vstelm_w(dst0, dst_ptr, 8, 2);
+    src_ptr += 32;
+    ptr1 += 32;
+    ptr2 += 32;
+    dst_ptr += 12;
+  }
+}
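The magic constants in the two box kernels above are 16-bit fixed-point
reciprocals: multiplying a box sum by 65536/N and keeping the high 16 bits
divides by N without a vector divide. My reading of which constant serves
which box shape (not spelled out in the patch):

    static_assert(0x2AAA == 65536 / 6, "3x2 and 2x3 boxes: sum of 6 samples");
    static_assert(0x1C71 == 65536 / 9, "3x3 boxes: sum of 9 samples");
    static_assert(0x4000 == 65536 / 4, "2x2 boxes: sum of 4 samples");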
+
+void ScaleAddRow_LSX(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int x;
+  int len = src_width / 16;
+  __m128i src0, tmp0, tmp1, dst0, dst1;
+  __m128i zero = __lsx_vldi(0);
+
+  assert(src_width > 0);
+
+  for (x = 0; x < len; x++) {
+    src0 = __lsx_vld(src_ptr, 0);
+    DUP2_ARG2(__lsx_vld, dst_ptr, 0, dst_ptr, 16, dst0, dst1);
+    tmp0 = __lsx_vilvl_b(zero, src0);
+    tmp1 = __lsx_vilvh_b(zero, src0);
+    DUP2_ARG2(__lsx_vadd_h, dst0, tmp0, dst1, tmp1, dst0, dst1);
+    __lsx_vst(dst0, dst_ptr, 0);
+    __lsx_vst(dst1, dst_ptr, 16);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
+
+void ScaleFilterCols_LSX(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  int j;
+  int len = dst_width / 16;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i vec0, vec1, dst0;
+  __m128i vec_x = __lsx_vreplgr2vr_w(x);
+  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+  __m128i const1 = __lsx_vreplgr2vr_w(0xFFFF);
+  __m128i const2 = __lsx_vreplgr2vr_w(0x40);
+  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+  vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+  vec1 = __lsx_vslli_w(vec_dx, 2);
+  vec_x = __lsx_vadd_w(vec_x, vec0);
+
+  for (j = 0; j < len; j++) {
+    tmp0 = __lsx_vsrai_w(vec_x, 16);
+    tmp4 = __lsx_vand_v(vec_x, const1);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    tmp1 = __lsx_vsrai_w(vec_x, 16);
+    tmp5 = __lsx_vand_v(vec_x, const1);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    tmp2 = __lsx_vsrai_w(vec_x, 16);
+    tmp6 = __lsx_vand_v(vec_x, const1);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    tmp3 = __lsx_vsrai_w(vec_x, 16);
+    tmp7 = __lsx_vand_v(vec_x, const1);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    DUP4_ARG2(__lsx_vsrai_w, tmp4, 9, tmp5, 9, tmp6, 9, tmp7, 9,
+              tmp4, tmp5, tmp6, tmp7);
+    LOAD_DATA(src_ptr, tmp0, reg0);
+    LOAD_DATA(src_ptr, tmp1, reg1);
+    LOAD_DATA(src_ptr, tmp2, reg2);
+    LOAD_DATA(src_ptr, tmp3, reg3);
+    DUP4_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp2, 1, tmp3, 1,
+              tmp0, tmp1, tmp2, tmp3);
+    LOAD_DATA(src_ptr, tmp0, reg4);
+    LOAD_DATA(src_ptr, tmp1, reg5);
+    LOAD_DATA(src_ptr, tmp2, reg6);
+    LOAD_DATA(src_ptr, tmp3, reg7);
+    DUP4_ARG2(__lsx_vsub_w, reg4, reg0, reg5, reg1, reg6, reg2, reg7, reg3,
+              reg4, reg5, reg6, reg7);
+    DUP4_ARG2(__lsx_vmul_w, reg4, tmp4, reg5, tmp5, reg6, tmp6, reg7, tmp7,
+              reg4, reg5, reg6, reg7);
+    DUP4_ARG2(__lsx_vadd_w, reg4, const2, reg5, const2, reg6, const2, reg7,
+              const2, reg4, reg5, reg6, reg7);
+    DUP4_ARG2(__lsx_vsrai_w, reg4, 7, reg5, 7, reg6, 7, reg7, 7,
+              reg4, reg5, reg6, reg7);
+    DUP4_ARG2(__lsx_vadd_w, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vpickev_h, reg1, reg0, reg3, reg2, tmp0, tmp1);
+    dst0 = __lsx_vpickev_b(tmp1, tmp0);
+    __lsx_vst(dst0, dst_ptr, 0);
+    dst_ptr += 16;
+  }
+}
+
+void ScaleARGBCols_LSX(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  const uint32_t* src = (const uint32_t*)src_argb;
+  uint32_t* dst = (uint32_t*)dst_argb;
+  int j;
+  int len = dst_width / 4;
+  __m128i tmp0, tmp1, tmp2, dst0;
+  __m128i vec_x = __lsx_vreplgr2vr_w(x);
+  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+
+  tmp0 = __lsx_vmul_w(vec_dx, const_tmp);
+  tmp1 = __lsx_vslli_w(vec_dx, 2);
+  vec_x = __lsx_vadd_w(vec_x, tmp0);
+
+  for (j = 0; j < len; j++) {
+    tmp2 = __lsx_vsrai_w(vec_x, 16);
+    vec_x = __lsx_vadd_w(vec_x, tmp1);
+    LOAD_DATA(src, tmp2, dst0);
+    __lsx_vst(dst0, dst, 0);
+    dst += 4;
+  }
+}
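ScaleFilterCols_LSX above walks the source in 16.16 fixed point (x advances
by dx per output pixel) and gathers pixels through LOAD_DATA; per output
pixel it is equivalent to this scalar sketch (the kernel keeps a 7-bit
interpolation weight and rounds with +64):

    int xi = x >> 16;              // integer source index (16.16 fixed point)
    int frac = (x & 0xFFFF) >> 9;  // 7-bit blending weight
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)(a + (((b - a) * frac + 64) >> 7));
    x += dx;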
+
+void ScaleARGBFilterCols_LSX(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx) {
+  const uint32_t* src = (const uint32_t*)src_argb;
+  int j;
+  int len = dst_width / 8;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i vec0, vec1, dst0, dst1;
+  __m128i vec_x = __lsx_vreplgr2vr_w(x);
+  __m128i vec_dx = __lsx_vreplgr2vr_w(dx);
+  __m128i const_tmp = {0x0000000100000000, 0x0000000300000002};
+  __m128i const_7f = __lsx_vldi(0x7F);
+
+  vec0 = __lsx_vmul_w(vec_dx, const_tmp);
+  vec1 = __lsx_vslli_w(vec_dx, 2);
+  vec_x = __lsx_vadd_w(vec_x, vec0);
+
+  for (j = 0; j < len; j++) {
+    tmp0 = __lsx_vsrai_w(vec_x, 16);
+    reg0 = __lsx_vsrai_w(vec_x, 9);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    tmp1 = __lsx_vsrai_w(vec_x, 16);
+    reg1 = __lsx_vsrai_w(vec_x, 9);
+    vec_x = __lsx_vadd_w(vec_x, vec1);
+    DUP2_ARG2(__lsx_vand_v, reg0, const_7f, reg1, const_7f, reg0, reg1);
+    DUP2_ARG2(__lsx_vshuf4i_b, reg0, 0, reg1, 0, reg0, reg1);
+    DUP2_ARG2(__lsx_vxor_v, reg0, const_7f, reg1, const_7f, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, reg0, reg2, reg1, reg3, reg4, reg6);
+    DUP2_ARG2(__lsx_vilvh_b, reg0, reg2, reg1, reg3, reg5, reg7);
+    LOAD_DATA(src, tmp0, src0);
+    LOAD_DATA(src, tmp1, src1);
+    DUP2_ARG2(__lsx_vaddi_wu, tmp0, 1, tmp1, 1, tmp0, tmp1);
+    LOAD_DATA(src, tmp0, src2);
+    LOAD_DATA(src, tmp1, src3);
+    DUP2_ARG2(__lsx_vilvl_b, src2, src0, src3, src1, tmp4, tmp6);
+    DUP2_ARG2(__lsx_vilvh_b, src2, src0, src3, src1, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, reg4, tmp5, reg5, tmp6, reg6, tmp7, reg7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vsrani_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst0, dst1);
+    __lsx_vst(dst0, dst_argb, 0);
+    __lsx_vst(dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
+void ScaleRowDown34_LSX(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x;
+  (void)src_stride;
+  __m128i src0, src1, src2, src3;
+  __m128i dst0, dst1, dst2;
+  __m128i shuff0 = {0x0908070504030100, 0x141311100F0D0C0B};
+  __m128i shuff1 = {0x0F0D0C0B09080705, 0x1918171514131110};
+  __m128i shuff2 = {0x141311100F0D0C0B, 0x1F1D1C1B19181715};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, shuff0, src2, src1, shuff1,
+              dst0, dst1);
+    dst2 = __lsx_vshuf_b(src3, src2, shuff2);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    __lsx_vst(dst2, dst, 32);
+    src_ptr += 64;
+    dst += 48;
+  }
+}
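The three shuffle constants above implement the unfiltered 3/4 pattern: of
every 4 source pixels, indices 0, 1 and 3 survive, matching the scalar
version of this row function (sketch):

    for (int x = 0; x < dst_width; x += 3) {
      dst[0] = src_ptr[0];
      dst[1] = src_ptr[1];
      dst[2] = src_ptr[3];
      dst += 3;
      src_ptr += 4;
    }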
+
+void ScaleRowDown34_0_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* src_nex = src_ptr + src_stride;
+  int x;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+  __m128i tmp10, tmp11, dst0, dst1, dst2;
+  __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+  __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+  __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+  __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+  __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+  __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+  __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+  __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+  __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+              src4, src5, src6, src7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1,
+              src1, shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4,
+              src4, shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7,
+              src6, shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+              const0, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+              const1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1,
+              tmp11, const2, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+              shift0, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+              shift1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+              shift2, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vslli_h, src0, 1, src1, 1, src2, 1, src3, 1,
+              tmp5, tmp6, tmp7, tmp8);
+    DUP2_ARG2(__lsx_vslli_h, src4, 1, src5, 1, tmp9, tmp10);
+    DUP4_ARG2(__lsx_vadd_h, src0, tmp5, src1, tmp6, src2, tmp7, src3, tmp8,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vadd_h, src4, tmp9, src5, tmp10, src4, src5);
+    DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+    DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 2, src3, src2, 2, dst0, dst1);
+    dst2 = __lsx_vsrarni_b_h(src5, src4, 2);
+    __lsx_vst(dst0, d, 0);
+    __lsx_vst(dst1, d, 16);
+    __lsx_vst(dst2, d, 32);
+    src_ptr += 64;
+    src_nex += 64;
+    d += 48;
+  }
+}
+
+void ScaleRowDown34_1_Box_LSX(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* src_nex = src_ptr + src_stride;
+  int x;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+  __m128i tmp10, tmp11, dst0, dst1, dst2;
+  __m128i const0 = {0x0103030101010103, 0x0101010303010101};
+  __m128i const1 = {0x0301010101030301, 0x0103030101010103};
+  __m128i const2 = {0x0101010303010101, 0x0301010101030301};
+  __m128i shuff0 = {0x0504030202010100, 0x0A09090807060605};
+  __m128i shuff1 = {0x0F0E0E0D0D0C0B0A, 0x1514131212111110};
+  __m128i shuff2 = {0x0A09090807060605, 0x0F0E0E0D0D0C0B0A};
+  __m128i shift0 = {0x0002000200010002, 0x0001000200020001};
+  __m128i shift1 = {0x0002000100020002, 0x0002000200010002};
+  __m128i shift2 = {0x0001000200020001, 0x0002000100020002};
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vld, src_nex, 0, src_nex, 16, src_nex, 32, src_nex, 48,
+              src4, src5, src6, src7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, shuff0, src1, src0, shuff1, src1,
+              src1, shuff2, src2, src2, shuff0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src3, src2, shuff1, src3, src3, shuff2, src4,
+              src4, shuff0, src5, src4, shuff1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src5, src5, shuff2, src6, src6, shuff0, src7,
+              src6, shuff1, src7, src7, shuff2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp0, const0, tmp1, const1, tmp2, const2, tmp3,
+              const0, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp4, const1, tmp5, const2, tmp6, const0, tmp7,
+              const1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, tmp8, const2, tmp9, const0, tmp10, const1,
+              tmp11, const2, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsrar_h, src0, shift0, src1, shift1, src2, shift2, src3,
+              shift0, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vsrar_h, src4, shift1, src5, shift2, src6, shift0, src7,
+              shift1, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vsrar_h, tmp0, shift2, tmp1, shift0, tmp2, shift1, tmp3,
+              shift2, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vadd_h, src0, src6, src1, src7, src2, tmp0, src3, tmp1,
+              src0, src1, src2, src3);
+    DUP2_ARG2(__lsx_vadd_h, src4, tmp2, src5, tmp3, src4, src5);
+    DUP2_ARG3(__lsx_vsrarni_b_h, src1, src0, 1, src3, src2, 1, dst0, dst1);
+    dst2 = __lsx_vsrarni_b_h(src5, src4, 1);
+    __lsx_vst(dst0, d, 0);
+    __lsx_vst(dst1, d, 16);
+    __lsx_vst(dst2, d, 32);
+    src_ptr += 64;
+    src_nex += 64;
+    d += 48;
+  }
+}
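The only differences between the two filtered 3/4 kernels are the row
weights and the final rounding shift; my summary of the shifts above (not
stated in the patch):

    // ScaleRowDown34_0_Box: dst = (3 * row0 + row1 + 2) >> 2  (3:1 weights)
    // ScaleRowDown34_1_Box: dst = (row0 + row1 + 1) >> 1      (1:1 weights)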
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)