From b1504a8e48acbb0d1fee6f1c0fe3851ab71348fb Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 18 Nov 2016 15:05:10 -0800 Subject: [PATCH] Add MSA optimized ARGBToRGB24Row_MSA and ARGBToRAWRow_MSA functions R=fbarchard@google.com BUG=libyuv:634 Review URL: https://codereview.chromium.org/2487913004 . --- include/libyuv/row.h | 6 +++ source/convert_from_argb.cc | 16 +++++++ source/row_any.cc | 4 ++ source/row_msa.cc | 84 +++++++++++++++++++++++++++++-------- 4 files changed, 92 insertions(+), 18 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7b49fe3d5..8f9bc6cd5 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -384,6 +384,8 @@ extern "C" { #define HAS_I422TORGBAROW_MSA #define HAS_I422ALPHATOARGBROW_MSA #define HAS_I422TORGB24ROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORAWROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -1304,6 +1306,8 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width); +void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); @@ -1892,6 +1896,8 @@ void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width); +void ARGBToRGB24Row_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToRAWRow_Any_MSA(const uint8* src_argb, uint8* dst_rgb, int width); void I444ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 03da3b6e2..eb8bd4459 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -841,6 +841,14 @@ int ARGBToRGB24(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -891,6 +899,14 @@ int ARGBToRAW(const uint8* src_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); diff --git a/source/row_any.cc b/source/row_any.cc index b1c86a294..6d068ecce 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -380,6 +380,10 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif diff --git a/source/row_msa.cc b/source/row_msa.cc index 32d296391..130bc2d2c 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -229,12 +229,12 @@ void I422ToARGBRow_MSA(const uint8* src_y, } } -void YUVTORGBARow_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { +void I422ToRGBARow_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; @@ -289,12 +289,12 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y, } } -void YUVTORGB24Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int32 width) { +void I422ToRGB24Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int32 width) { int x; int64 data_u, data_v; v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2; @@ -340,12 +340,12 @@ void YUVTORGB24Row_MSA(const uint8* src_y, } // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. -void YUVTORGB565Row_MSA(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { +void I422ToRGB565Row_MSA(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; @@ -740,6 +740,54 @@ void ARGBToUVRow_MSA(const uint8* src_argb0, } } +void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444, uint8* dst_argb, int width) {