From ce5b333853c719a7d868fe08fc8fe7a9e6c56079 Mon Sep 17 00:00:00 2001 From: Shiyou Yin Date: Mon, 25 May 2020 22:02:15 +0800 Subject: [PATCH] ARGBToI420 MMI and MSA version match C. In commit 0b8bb6, C version has been updated. This patch updates the MMI and MSA version to match C version. Change-Id: Ib28da3629a8465990c8e2185278a95af8c27a31d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2227754 Commit-Queue: Frank Barchard Reviewed-by: Frank Barchard --- source/row_any.cc | 6 +- source/row_mmi.cc | 230 +++++++++++++++++++++++------- source/row_msa.cc | 295 ++++++++++++++++++++++++---------------- 3 files changed, 310 insertions(+), 221 deletions(-) diff --git a/source/row_any.cc b/source/row_any.cc index 61cc7cd87..933b9c9d7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1439,7 +1439,7 @@ ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15) ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MSA -ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_BGRATOUVROW_MMI ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) @@ -1448,7 +1448,7 @@ ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15) ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MSA -ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_ABGRTOUVROW_MMI ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) @@ -1457,7 +1457,7 @@ ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15) ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MSA -ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15) #endif #ifdef HAS_RGBATOUVROW_MMI ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15) diff --git a/source/row_mmi.cc b/source/row_mmi.cc index 
d4afceb89..57c70a36a 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -690,12 +690,15 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -709,7 +712,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" @@ -727,7 +731,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -754,7 +759,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 
%[dest1_v], %[src0], %[value] \n\t" @@ -772,7 +778,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -799,7 +806,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" @@ -817,7 +825,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -844,7 +853,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" @@ -862,7 +872,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" 
"paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -900,11 +911,12 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -994,12 +1006,15 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1013,7 +1028,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_u], 
%[dest0_u], %[value] \n\t" "pinsrh_0 %[dest0_v], %[src0], %[value] \n\t" @@ -1031,7 +1047,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1058,7 +1075,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_0 %[dest1_v], %[src0], %[value] \n\t" @@ -1076,7 +1094,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1103,7 +1122,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_0 %[dest2_v], %[src0], %[value] \n\t" @@ -1121,7 +1141,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] 
\n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1148,7 +1169,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_0 %[dest3_v], %[src0], %[value] \n\t" @@ -1166,7 +1188,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsrl %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_0 %[src_hi], %[src0], %[value] \n\t" @@ -1204,11 +1227,12 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1298,12 
+1322,15 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002F00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1317,7 +1344,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" @@ -1335,7 +1363,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1362,7 +1391,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -1380,7 +1410,8 @@ void ABGRToUVRow_MMI(const 
uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1407,7 +1438,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" @@ -1425,7 +1457,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1452,7 +1485,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -1470,7 +1504,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] 
\n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -1508,11 +1543,12 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1602,12 +1638,15 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1621,7 +1660,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" 
@@ -1639,7 +1679,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1666,7 +1707,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -1684,7 +1726,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1711,7 +1754,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" @@ -1729,7 +1773,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], 
%[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1756,7 +1801,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -1774,7 +1820,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" @@ -1812,11 +1859,12 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -1910,12 +1958,15 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int 
width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x0026004a00700002; - const uint64_t mask_v = 0x00020070005e0012; + const uint64_t mask_u = 0x0013002500380002; + const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -1931,7 +1982,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" @@ -1951,7 +2003,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -1980,7 +2033,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" @@ -2000,7 +2054,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], 
%[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2029,7 +2084,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" @@ -2049,7 +2105,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2078,7 +2135,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" @@ -2098,7 +2156,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] 
\n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" @@ -2136,11 +2195,12 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -2234,12 +2294,15 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, uint8_t* dst_v, int width) { uint64_t src_rgb1; - uint64_t ftmp[12]; + uint64_t ftmp[13]; + uint64_t tmp[1]; const uint64_t value = 0x4040; - const uint64_t mask_u = 0x00020070004a0026; - const uint64_t mask_v = 0x0012005e00700002; + const uint64_t mask_u = 0x0002003800250013; + const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( + "dli %[tmp0], 0x0001000100010001 \n\t" + "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" @@ -2255,7 +2318,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" @@ -2275,7 +2339,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], 
%[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2304,7 +2369,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" @@ -2324,7 +2390,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2353,7 +2420,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" @@ -2373,7 +2441,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] 
\n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2402,7 +2471,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" @@ -2422,7 +2492,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" - "psrlh %[src0], %[src0], %[two] \n\t" + "paddh %[src0], %[src0], %[ftmp12] \n\t" + "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" @@ -2460,11 +2531,12 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), - [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) + [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } @@ -2764,7 +2836,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] 
"r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), - [zero] "f"(0x00), [eight] "f"(0x08), [two] "f"(0x02), + [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10) : "memory"); } diff --git a/source/row_msa.cc b/source/row_msa.cc index 7ca34daa7..ff7e187cd 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -155,11 +155,10 @@ extern "C" { } // Loads current and next row of ARGB input and averages it to calculate U and V -#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \ { \ v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - v16u8 vec8_m, vec9_m; \ v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ v8u16 reg8_m, reg9_m; \ \ @@ -195,53 +194,16 @@ extern "C" { reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ - src0_m = (v16u8)__msa_ld_b((void*)s, 64); \ - src1_m = (v16u8)__msa_ld_b((void*)s, 80); \ - src2_m = (v16u8)__msa_ld_b((void*)s, 96); \ - src3_m = (v16u8)__msa_ld_b((void*)s, 112); \ - src4_m = (v16u8)__msa_ld_b((void*)t, 64); \ - src5_m = (v16u8)__msa_ld_b((void*)t, 80); \ - src6_m = (v16u8)__msa_ld_b((void*)t, 96); \ - src7_m = (v16u8)__msa_ld_b((void*)t, 112); \ - vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ - vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ - vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ - 
vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ - vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ - vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ - vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ - vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ - reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ - reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ - reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ - reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ - reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ - reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ - reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ - reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ - reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ - reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ - reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ - reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ - reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ - reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ - reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ - reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ - argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ - argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + reg8_m += const_0x0101; \ + reg9_m += const_0x0101; \ + reg0_m += const_0x0101; \ + reg1_m += const_0x0101; \ + argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1); \ + argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1); \ + argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1); \ + argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1); \ } -// Takes ARGB input and calculates U and V. 
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ shf0, shf1, shf2, shf3, v_out, u_out) \ { \ @@ -272,6 +234,39 @@ extern "C" { u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ } +// Takes ARGB input and calculates U and V. +#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_w(vec0_m, const1); \ + reg1_m = __msa_dotp_u_w(vec1_m, const1); \ + reg2_m = __msa_dotp_u_w(vec4_m, const1); \ + reg3_m = __msa_dotp_u_w(vec5_m, const1); \ + reg0_m += (v4u32)const3; \ + reg1_m += (v4u32)const3; \ + reg2_m += (v4u32)const3; \ + reg3_m += (v4u32)const3; \ + reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ + u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ + v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ + } + // Load I444 pixel data #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ { \ @@ -839,12 +834,13 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, 
reg4, reg5, reg6, reg7, reg8, reg9; v16u8 dst0, dst1; - v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); @@ -903,12 +899,18 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 += __msa_hadd_u_h(vec5, vec5); reg4 += __msa_hadd_u_h(vec0, vec0); reg5 += __msa_hadd_u_h(vec1, vec1); - reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); - reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); - reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); - reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); - reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); - reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg4 += const_0x0001; + reg5 += const_0x0001; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); reg6 = reg0 * const_0x70; reg7 = reg1 * const_0x70; reg8 = reg2 * const_0x4A; @@ -2045,12 +2047,13 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = 
(v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2099,10 +2102,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h((v8i16)reg0, 2); - reg1 = __msa_srai_h((v8i16)reg1, 2); - reg2 = __msa_srai_h((v8i16)reg2, 2); - reg3 = __msa_srai_h((v8i16)reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h((v8i16)reg0, 1); + reg1 = __msa_srai_h((v8i16)reg1, 1); + reg2 = __msa_srai_h((v8i16)reg2, 1); + reg3 = __msa_srai_h((v8i16)reg3, 1); vec4 = (v8u16)__msa_pckev_h(reg1, reg0); vec5 = (v8u16)__msa_pckev_h(reg3, reg2); vec6 = (v8u16)__msa_pckod_h(reg1, reg0); @@ -2150,12 +2157,13 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0, v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; - v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); - v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); - v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); - v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); - v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x0001 = 
(v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; @@ -2204,10 +2212,14 @@ void RAWToUVRow_MSA(const uint8_t* src_rgb0, reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); - reg0 = __msa_srai_h(reg0, 2); - reg1 = __msa_srai_h(reg1, 2); - reg2 = __msa_srai_h(reg2, 2); - reg3 = __msa_srai_h(reg3, 2); + reg0 += const_0x0001; + reg1 += const_0x0001; + reg2 += const_0x0001; + reg3 += const_0x0001; + reg0 = __msa_srai_h(reg0, 1); + reg1 = __msa_srai_h(reg1, 1); + reg2 = __msa_srai_h(reg2, 1); + reg3 = __msa_srai_h(reg3, 1); vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); @@ -2588,28 +2600,30 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; + v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; + v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = 
(v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_v); - ST_UB(dst1, dst_u); - s += 128; - t += 128; - dst_v += 16; - dst_u += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, + shuffler2, shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2621,29 +2635,30 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 src0, src1, src2, src3; + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; - v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); - v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; + v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; + v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; + v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; + v8u16 const_0x09002f = 
(v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, src0, src1, src2, src3); - ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, - const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, + shuffler2, shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } } @@ -2655,28 +2670,30 @@ void RGBAToUVRow_MSA(const uint8_t* src_rgb0, int x; const uint8_t* s = src_rgb0; const uint8_t* t = src_rgb0 + src_stride_rgb; - v16u8 dst0, dst1, vec0, vec1, vec2, vec3; - v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; - v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, - 18, 19, 22, 23, 26, 27, 30, 31}; - v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; - v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; - v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); - v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); - v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); - v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + const uint8_t unused = 0xf; + v8u16 src0, src1, src2, src3; + v16u8 dst0, dst1; + v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; + v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; + v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; + 
v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; + v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); + v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); + v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); + v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); + v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); - for (x = 0; x < width; x += 32) { - READ_ARGB(s, t, vec0, vec1, vec2, vec3); - ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, - const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, - dst1); - ST_UB(dst0, dst_u); - ST_UB(dst1, dst_v); - s += 128; - t += 128; - dst_u += 16; - dst_v += 16; + for (x = 0; x < width; x += 16) { + READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); + ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, + const_0x250013, const_0x008080, shuffler0, shuffler1, + shuffler2, shuffler3, dst0, dst1); + *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); + *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); + s += 64; + t += 64; + dst_u += 8; + dst_v += 8; } }