From bf69adfd645dd3c79254b25b9eaa7b36b2410f3a Mon Sep 17 00:00:00 2001
From: lixia zhang
Date: Mon, 6 Aug 2018 23:35:40 -0400
Subject: [PATCH] libyuv:loongson Correct the MMI optimization on the loongson3a platform.

When loading or storing data, an unaligned address greatly degrades performance, so unaligned-access instructions are required on the Loongson platform. Also delete the optimization function ScaleARGBFilterCols_MMI, because it degraded performance.

BUG=libyuv:804
R=fbarchard@chromium.org

Change-Id: If4c15886a21cdcbac7ae8b336292e4549acf1e47
Reviewed-on: https://chromium-review.googlesource.com/1164627
Reviewed-by: Frank Barchard
Commit-Queue: Frank Barchard
---
 include/libyuv/scale_row.h |  11 --
 source/row_mmi.cc          | 225 ++++++++++++++++++++++++-------------
 source/scale_mmi.cc        | 184 ++++++++++++++----------------
 3 files changed, 227 insertions(+), 193 deletions(-)

diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 3042136df..282d5216f 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -115,7 +115,6 @@ extern "C" { #define HAS_FIXEDDIV_MIPS #define HAS_SCALEARGBCOLS_MMI #define HAS_SCALEARGBCOLSUP2_MMI -#define HAS_SCALEARGBFILTERCOLS_MMI #define HAS_SCALEARGBROWDOWN2_MMI #define HAS_SCALEARGBROWDOWNEVEN_MMI #define HAS_SCALEROWDOWN2_MMI @@ -592,21 +591,11 @@ void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, int dst_width, int x, int dx); -void ScaleARGBFilterCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx); void ScaleARGBCols_MMI(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); -void ScaleARGBFilterCols_Any_MMI(uint8_t* dst_ptr, - const uint8_t* src_ptr, - int dst_width, - int x, - int dx); void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, diff --git a/source/row_mmi.cc b/source/row_mmi.cc index dab801068..364995298 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -7,10 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include #include "libyuv/row.h" -#include #include <string.h>  // For memcpy and memset. 
#include "libyuv/basic_types.h" @@ -4492,7 +4490,8 @@ void SobelXRow_MMI(const uint8_t* src_y0, "psubh %[y00], %[y10], %[y20] \n\t" "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "sdc1 %[sobel], 0(%[dst_sobelx]) \n\t" + "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" + "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" "daddiu %[src_y0], %[src_y0], 8 \n\t" "daddiu %[src_y1], %[src_y1], 8 \n\t" @@ -4587,7 +4586,8 @@ void SobelYRow_MMI(const uint8_t* src_y0, "psubh %[y00], %[y02], %[y12] \n\t" "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 - "sdc1 %[sobel], 0(%[dst_sobely]) \n\t" + "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" + "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" "daddiu %[src_y0], %[src_y0], 8 \n\t" "daddiu %[src_y1], %[src_y1], 8 \n\t" @@ -4624,13 +4624,15 @@ void SobelRow_MMI(const uint8_t* src_sobelx, "punpcklbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" // 255 s1 s1 s1 s55 s0 s0 s0 - "sdc1 %[t1], 0x00(%[dst_argb]) \n\t" + "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 "punpckhbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" // 255 s3 s3 s3 255 s2 s2 s2 - "sdc1 %[t1], 0x08(%[dst_argb]) \n\t" + "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 "punpckhbh %[t0], %[t2], %[t2] \n\t" @@ -4638,12 +4640,14 @@ void SobelRow_MMI(const uint8_t* src_sobelx, // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 "punpcklbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" - "sdc1 %[t1], 0x10(%[dst_argb]) \n\t" + "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" // s7 s7 s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 "punpckhbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" - "sdc1 %[t1], 0x18(%[dst_argb]) \n\t" + "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" + "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" "daddiu %[dst_argb], %[dst_argb], 32 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" @@ -4665,10 +4669,13 @@ void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, uint64_t tb = 0; __asm__ volatile( "1: \n\t" - "ldc1 %[tr], 0x0(%[src_sobelx]) \n\t" // r=src_sobelx[i] - "ldc1 %[tb], 0x0(%[src_sobely]) \n\t" // b=src_sobely[i] + "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" + "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] + "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" + "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] "paddusb %[tr], %[tr], %[tb] \n\t" // g - "sdc1 %[tr], 0x0(%[dst_y]) \n\t" + "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" "daddiu %[dst_y], %[dst_y], 8 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" @@ -4705,10 +4712,12 @@ void SobelXYRow_MMI(const uint8_t* src_sobelx, "punpcklbh %[cr], %[tr], %[c1] \n\t" // c1 r1 g1 b1 c0 r0 g0 b0 "punpcklhw %[result], %[gb], %[cr] \n\t" - "sdc1 %[result], 0x00(%[dst_argb]) \n\t" + "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" // c3 r3 g3 b3 c2 r2 g2 b2 "punpckhhw %[result], %[gb], %[cr] \n\t" - "sdc1 %[result], 0x08(%[dst_argb]) \n\t" + "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" // g7 b7 g6 b6 g5 b5 g4 b4 "punpckhbh %[gb], %[tb], %[tg] \n\t" @@ -4716,10 +4725,12 @@ void SobelXYRow_MMI(const uint8_t* src_sobelx, "punpckhbh %[cr], %[tr], %[c1] \n\t" // c5 r5 g5 b5 c4 r4 g4 b4 "punpcklhw %[result], %[gb], %[cr] \n\t" - "sdc1 %[result], 0x10(%[dst_argb]) \n\t" + "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" + "gssdlc1 %[result], 
0x17(%[dst_argb]) \n\t" // c7 r7 g7 b7 c6 r6 g6 b6 "punpckhhw %[result], %[gb], %[cr] \n\t" - "sdc1 %[result], 0x18(%[dst_argb]) \n\t" + "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" + "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" "daddiu %[dst_argb], %[dst_argb], 32 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" @@ -4748,12 +4759,14 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { "punpcklhw %[dest], %[src], %[src] \n\t" "and %[dest], %[dest], %[mask0] \n\t" "or %[dest], %[dest], %[mask1] \n\t" - "sdc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "punpckhhw %[dest], %[src], %[src] \n\t" "and %[dest], %[dest], %[mask0] \n\t" "or %[dest], %[dest], %[mask1] \n\t" - "sdc1 %[dest], 0x08(%[dst_ptr]) \n\t" + "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" + "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" @@ -4955,7 +4968,8 @@ void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { "gsldlc1 %[temp], 3(%[src]) \n\t" "gsldrc1 %[temp], -4(%[src]) \n\t" "pshufh %[temp], %[temp], %[shuff] \n\t" - "sdc1 %[temp], 0x0(%[dst]) \n\t" + "gssdrc1 %[temp], 0x0(%[dst]) \n\t" + "gssdlc1 %[temp], 0x7(%[dst]) \n\t" "daddiu %[src], %[src], -0x08 \n\t" "daddiu %[dst], %[dst], 0x08 \n\t" @@ -4975,18 +4989,22 @@ void SplitUVRow_MMI(const uint8_t* src_uv, uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_uv]) \n\t" - "ldc1 %[t1], 0x08(%[src_uv]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" "and %[t2], %[t0], %[c0] \n\t" "and %[t3], %[t1], %[c0] \n\t" "packushb %[t2], %[t2], %[t3] \n\t" - "sdc1 %[t2], 0x0(%[dst_u]) \n\t" + "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" "psrlh %[t2], %[t0], %[shift] \n\t" "psrlh %[t3], %[t1], %[shift] \n\t" "packushb %[t2], %[t2], %[t3] \n\t" - "sdc1 %[t2], 0x0(%[dst_v]) \n\t" + "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" "daddiu %[src_uv], %[src_uv], 16 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" @@ -5008,12 +5026,16 @@ void MergeUVRow_MMI(const uint8_t* src_u, uint64_t temp[3]; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x0(%[src_u]) \n\t" - "ldc1 %[t1], 0x0(%[src_v]) \n\t" + "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" + "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" + "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" "punpcklbh %[t2], %[t0], %[t1] \n\t" - "sdc1 %[t2], 0x0(%[dst_uv]) \n\t" + "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" + "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" "punpckhbh %[t2], %[t0], %[t1] \n\t" - "sdc1 %[t2], 0x8(%[dst_uv]) \n\t" + "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" + "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" "daddiu %[src_u], %[src_u], 8 \n\t" "daddiu %[src_v], %[src_v], 8 \n\t" @@ -5149,13 +5171,17 @@ void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, uint64_t src_stride = 0x0; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" - "ldc1 %[t1], 0x00(%[src_stride]) \n\t" - "pavgb %[t0], %[t0], %[t1] \n\t" + "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" + "pavgb %[t0], %[t0], %[t1] \n\t" - "ldc1 %[t2], 0x08(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldrc1 %[t2], 
0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" @@ -5167,12 +5193,16 @@ void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, "and %[d0], %[t0], %[c1] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" - "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" - "ldc1 %[t2], 0x18(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" + "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" @@ -5186,8 +5216,10 @@ void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" - "sdc1 %[d0], 0x0(%[dst_u]) \n\t" - "sdc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" @@ -5215,8 +5247,10 @@ void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" @@ -5226,8 +5260,10 @@ void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, "and %[d0], %[t0], %[c1] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" - "ldc1 %[t0], 0x10(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x18(%[src_yuy2]) \n\t" + "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" @@ -5239,8 +5275,10 @@ void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" - "sdc1 %[d0], 0x0(%[dst_u]) \n\t" - "sdc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" @@ -5256,17 +5294,19 @@ void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, // Copy row of YUY2 Y's (422) into Y (420/422). void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - // Output a row of UV values, filtering 2 rows of YUY2. 
uint64_t c0 = 0x00ff00ff00ff00ff; uint64_t temp[2]; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_yuy2]) \n\t" - "ldc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" - "sdc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" "daddiu %[dst_y], %[dst_y], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" @@ -5292,13 +5332,17 @@ void UYVYToUVRow_MMI(const uint8_t* src_uyvy, uint64_t src_stride = 0x0; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" - "ldc1 %[t1], 0x00(%[src_stride]) \n\t" + "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" - "ldc1 %[t2], 0x08(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" @@ -5308,12 +5352,16 @@ void UYVYToUVRow_MMI(const uint8_t* src_uyvy, "and %[d0], %[t0], %[c0] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" - "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" - "ldc1 %[t2], 0x18(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" + "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" @@ -5325,8 +5373,10 @@ void UYVYToUVRow_MMI(const uint8_t* src_uyvy, "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" - "sdc1 %[d0], 0x0(%[dst_u]) \n\t" - "sdc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" @@ -5354,8 +5404,10 @@ void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" @@ -5363,8 +5415,10 @@ void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, "and %[d0], %[t0], %[c0] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" - "ldc1 %[t0], 0x10(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x18(%[src_uyvy]) \n\t" + "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], 
%[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" @@ -5374,8 +5428,10 @@ void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" - "sdc1 %[d0], 0x0(%[dst_u]) \n\t" - "sdc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" + "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" + "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" @@ -5397,15 +5453,18 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { uint64_t temp[2]; __asm__ volatile( "1: \n\t" - "ldc1 %[t0], 0x00(%[src_uyvy]) \n\t" - "ldc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" + "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" + "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" + "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" "dsrl %[t0], %[t0], %[shift] \n\t" "dsrl %[t1], %[t1], %[shift] \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" - "sdc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" + "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t" "daddiu %[dst_y], %[dst_y], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" @@ -5670,19 +5729,22 @@ void InterpolateRow_MMI(uint8_t* dst_ptr, uint64_t uv = 0x0; uint64_t uv_stride = 0x0; __asm__ volatile( - "1: \n\t" - "ldc1 %[uv], 0x0(%[src_ptr]) \n\t" - "daddu $t0, %[src_ptr], %[stride] \n\t" - "ldc1 %[uv_stride], 0x0($t0) \n\t" + "1: \n\t" + "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" + "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" + "daddu $t0, %[src_ptr], %[stride] \n\t" + "gsldrc1 %[uv_stride], 0x0($t0) \n\t" + "gsldlc1 %[uv_stride], 0x7($t0) \n\t" - "pavgb %[uv], %[uv], %[uv_stride] \n\t" - "sdc1 %[uv], 0x0(%[dst_ptr]) \n\t" + "pavgb %[uv], %[uv], %[uv_stride] \n\t" + "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" + "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" - "daddiu %[src_ptr], %[src_ptr], 8 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" - "daddiu %[width], %[width], -8 \n\t" - "bgtz %[width], 1b \n\t" - "nop \n\t" + "daddiu %[src_ptr], %[src_ptr], 8 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" + "daddiu %[width], %[width], -8 \n\t" + "bgtz %[width], 1b \n\t" + "nop \n\t" : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), [stride] "r"((int64_t)src_stride) @@ -5700,10 +5762,12 @@ void InterpolateRow_MMI(uint8_t* dst_ptr, "pshufh %[fy1], %[fy1], %[zero] \n\t" "psubh %[fy0], %[fy0], %[fy1] \n\t" "1: \n\t" - "ldc1 %[t0], 0x0(%[src_ptr]) \n\t" + "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" "punpcklbh %[d0], %[t0], %[zero] \n\t" "punpckhbh %[d1], %[t0], %[zero] \n\t" - "ldc1 %[t0], 0x0(%[src_ptr1]) \n\t" + "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" + "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" "punpcklbh %[d2], %[t0], %[zero] \n\t" "punpckhbh %[d3], %[t0], %[zero] \n\t" @@ -5720,7 +5784,8 @@ void InterpolateRow_MMI(uint8_t* dst_ptr, "psrlh %[d1], %[d1], %[shift] \n\t" "packushb %[d0], %[d0], %[d1] \n\t" - "sdc1 %[d0], 0x0(%[dst_ptr]) \n\t" + "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" + "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 8 \n\t" "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" diff --git a/source/scale_mmi.cc b/source/scale_mmi.cc index 4757d8997..e12c6bb79 100644 --- a/source/scale_mmi.cc +++ b/source/scale_mmi.cc 
@@ -38,10 +38,12 @@ void ScaleRowDown2_MMI(const uint8_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlh %[src0], %[src0], %[shift] \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlh %[src1], %[src1], %[shift] \n\t" "packushb %[dest], %[src0], %[src1] \n\t" @@ -72,9 +74,11 @@ void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "and %[dest0], %[src0], %[mask] \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "and %[dest1], %[src1], %[mask] \n\t" "packushb %[dest0], %[dest0], %[dest1] \n\t" @@ -114,11 +118,13 @@ void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[s0], 0x00(%[s]) \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" "psrlh %[s1], %[s0], %[shift1] \n\t" "and %[s0], %[s0], %[mask] \n\t" - "ldc1 %[t0], 0x00(%[t]) \n\t" + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" "psrlh %[t1], %[t0], %[shift1] \n\t" "and %[t0], %[t0], %[mask] \n\t" @@ -128,11 +134,13 @@ void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, "paddh %[dest0], %[dest0], %[ph] \n\t" "psrlh %[dest0], %[dest0], %[shift0] \n\t" - "ldc1 %[s0], 0x08(%[s]) \n\t" + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "psrlh %[s1], %[s0], %[shift1] \n\t" "and %[s0], %[s0], %[mask] \n\t" - "ldc1 %[t0], 0x08(%[t]) \n\t" + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "psrlh %[t1], %[t0], %[shift1] \n\t" "and %[t0], %[t0], %[mask] \n\t" @@ -172,8 +180,10 @@ void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpckhwd %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" @@ -237,12 +247,14 @@ void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, __asm__ volatile( "1: \n\t" - "ldc1 %[s0], 0x00(%[s]) \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" "punpcklbh %[s_lo], %[s0], %[mask] \n\t" "punpckhbh %[s_hi], %[s0], %[mask] \n\t" "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" - "ldc1 %[t0], 0x00(%[t]) \n\t" + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" "punpcklbh %[t_lo], %[t0], %[mask] \n\t" "punpckhbh %[t_hi], %[t0], %[mask] \n\t" "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" @@ -251,12 +263,14 @@ void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" - "ldc1 %[s0], 0x08(%[s]) \n\t" + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "punpcklbh %[s_lo], %[s0], %[mask] \n\t" "punpckhbh %[s_hi], %[s0], %[mask] \n\t" "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" - "ldc1 %[t0], 0x08(%[t]) \n\t" + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "punpcklbh %[t_lo], %[t0], %[mask] \n\t" "punpckhbh %[t_hi], %[t0], %[mask] \n\t" "paddh %[dest_hi], %[dest_hi], %[t_lo] \n\t" @@ -293,10 +307,12 @@ void 
ScaleRowDown2_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "packsswh %[dest], %[src0], %[src1] \n\t" @@ -324,8 +340,10 @@ void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" @@ -364,11 +382,13 @@ void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[s0], 0x00(%[s]) \n\t" + "gsldrc1 %[s0], 0x00(%[s]) \n\t" + "gsldlc1 %[s0], 0x07(%[s]) \n\t" "psrlw %[s1], %[s0], %[shift0] \n\t" "and %[s0], %[s0], %[mask] \n\t" - "ldc1 %[t0], 0x00(%[t]) \n\t" + "gsldrc1 %[t0], 0x00(%[t]) \n\t" + "gsldlc1 %[t0], 0x07(%[t]) \n\t" "psrlw %[t1], %[t0], %[shift0] \n\t" "and %[t0], %[t0], %[mask] \n\t" @@ -378,11 +398,13 @@ void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, "paddw %[dest0], %[dest0], %[ph] \n\t" "psrlw %[dest0], %[dest0], %[shift1] \n\t" - "ldc1 %[s0], 0x08(%[s]) \n\t" + "gsldrc1 %[s0], 0x08(%[s]) \n\t" + "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "psrlw %[s1], %[s0], %[shift0] \n\t" "and %[s0], %[s0], %[mask] \n\t" - "ldc1 %[t0], 0x08(%[t]) \n\t" + "gsldrc1 %[t0], 0x08(%[t]) \n\t" + "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "psrlw %[t1], %[t0], %[shift0] \n\t" "and %[t0], %[t0], %[mask] \n\t" @@ -425,18 +447,22 @@ void ScaleRowDown4_MMI(const uint8_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" "and %[src0], %[src0], %[mask] \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "and %[src1], %[src1], %[mask] \n\t" "packsswh %[dest_lo], %[src0], %[src1] \n\t" - "ldc1 %[src0], 0x10(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" "and %[src0], %[src0], %[mask] \n\t" - "ldc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "and %[src1], %[src1], %[mask] \n\t" "packsswh %[dest_hi], %[src0], %[src1] \n\t" @@ -469,13 +495,17 @@ void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src_ptr]) \n\t" - "ldc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" - "ldc1 %[src0], 0x10(%[src_ptr]) \n\t" - "ldc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" 
"punpckhhw %[dest_hi], %[src0], %[src1] \n\t" "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" @@ -691,7 +721,8 @@ void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklhw %[dest], %[src], %[src] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" @@ -721,9 +752,11 @@ void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { "punpcklbh %[src_lo], %[src], %[mask] \n\t" "punpckhbh %[src_hi], %[src], %[mask] \n\t" - "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "paddush %[dest0], %[dest0], %[src_lo] \n\t" - "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "paddush %[dest1], %[dest1], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" @@ -750,16 +783,19 @@ void ScaleAddRow_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklhw %[src_lo], %[src], %[mask] \n\t" "punpckhhw %[src_hi], %[src], %[mask] \n\t" - "ldc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" + "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "paddw %[dest0], %[dest0], %[src_lo] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" - "ldc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" + "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "paddw %[dest1], %[dest1], %[src_hi] \n\t" "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" @@ -922,7 +958,8 @@ void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, __asm__ volatile( "1: \n\t" - "ldc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklwd %[dest0], %[src], %[src] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" @@ -939,67 +976,6 @@ void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, : "memory"); } -void ScaleARGBFilterCols_MMI(uint8_t* dst_argb, - const uint8_t* src_argb, - int dst_width, - int x, - int dx) { - uint64_t dest, src, src_hi, src_lo; - int xi, xf, nxf; - int64_t fxf, fnxf; - - const uint8_t* src_ptr = src_argb; - - const uint64_t mask0 = 0; - const uint64_t mask1 = 0x7fULL; - - const uint64_t shift2 = 2; - const uint64_t shift9 = 9; - const uint64_t shift7 = 7; - const uint64_t shift16 = 16; - - __asm__ volatile( - "1: \n\t" - "dsrl %[xi], %[x], %[shift16] \n\t" - "dsll %[xi], %[xi], %[shift2] \n\t" - - "dadd %[src_ptr], %[src_argb], %[xi] \n\t" - "ldc1 %[src], 0x00(%[src_ptr]) \n\t" - "punpcklbh %[src_lo], %[src], %[mask0] \n\t" - "punpckhbh %[src_hi], %[src], %[mask0] \n\t" - - "dsrl %[xf], %[x], %[shift9] \n\t" - "andi %[xf], %[xf], 0x7f \n\t" - "xori %[nxf], %[xf], 0x7f \n\t" - "dmtc1 %[xf], %[fxf] \n\t" - "pshufh %[fxf], %[fxf], %[mask0] \n\t" - "dmtc1 %[nxf], %[fnxf] \n\t" - "pshufh %[fnxf], %[fnxf], %[mask0] \n\t" - - "pmullh %[src_lo], %[src_lo], %[fnxf] \n\t" - "pmullh %[src_hi], %[src_hi], %[fxf] \n\t" - "paddh %[dest], %[src_lo], %[src_hi] \n\t" - "psrlh %[dest], %[dest], %[shift7] \n\t" - "packushb %[dest], %[dest], %[mask0] \n\t" - - "dadd %[x], %[x], %[dx] \n\t" - - "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" - - "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" - "daddi %[width], %[width], 
-0x01 \n\t" - "bnez %[width], 1b \n\t" - : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), - [src_lo] "=&f"(src_lo), [fxf] "=&f"(fxf), [fnxf] "=&f"(fnxf), - [xi] "=&r"(xi), [xf] "=&r"(xf), [nxf] "=&r"(nxf) - : [src_argb] "r"(src_argb), [src_ptr] "r"(src_ptr), - [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [x] "r"(x), - [dx] "r"(dx), [mask0] "f"(mask0), [mask1] "f"(mask1), - [shift2] "r"(shift2), [shift7] "f"(shift7), [shift9] "r"(shift9), - [shift16] "r"(shift16) - : "memory"); -} - // Divide num by div and return as 16.16 fixed point result. /* LibYUVBaseTest.TestFixedDiv */ int FixedDiv_MIPS(int num, int div) { @@ -1058,9 +1034,11 @@ void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, __asm__ volatile( "1: \n\t" - "ldc1 %[src0], 0x00(%[src1_ptr]) \n\t" + "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" + "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" "pmaddhw %[dest04], %[src0], %[mask0] \n\t" - "ldc1 %[src1], 0x00(%[src2_ptr]) \n\t" + "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" + "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" "pmaddhw %[dest], %[src1], %[mask1] \n\t" "paddw %[dest04], %[dest04], %[dest] \n\t" "paddw %[dest04], %[dest04], %[ph] \n\t" @@ -1072,9 +1050,11 @@ void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, "paddw %[dest15], %[dest15], %[ph] \n\t" "psrlw %[dest15], %[dest15], %[shift] \n\t" - "ldc1 %[src0], 0x02(%[src1_ptr]) \n\t" + "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" + "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" "pmaddhw %[dest26], %[src0], %[mask0] \n\t" - "ldc1 %[src1], 0x02(%[src2_ptr]) \n\t" + "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" + "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" "pmaddhw %[dest], %[src1], %[mask1] \n\t" "paddw %[dest26], %[dest26], %[dest] \n\t" "paddw %[dest26], %[dest26], %[ph] \n\t"