From 78e44628c61977523dd6ddc94e2f668cfc8f1efe Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 17 Aug 2017 11:36:48 -0700
Subject: [PATCH] Add MSA optimized SplitUV, Set, MirrorUV, SobelX and SobelY
 row functions.

TBR=kjellander@chromium.org
R=fbarchard@google.com

Bug:libyuv:634
Change-Id: Ie2342f841f1bb8469fc4631b784eddd804f5d53e
Reviewed-on: https://chromium-review.googlesource.com/616765
Reviewed-by: Frank Barchard
---
Review notes (commentary only; text between the three-dash line and the first
"diff --git" is not part of the commit):

* Layout: the line structure of this patch was reconstructed. Indentation and
  blank context lines were restored so that every hunk agrees with its @@ line
  counts; all other tokens, including the asm string literals, are as found.
  Applying it may need whitespace-tolerant matching (for example
  "git apply --ignore-whitespace") - verify against the target tree.
* SplitUVRow_MSA handles 32 UV pairs per iteration (64 bytes in, 32 bytes out
  to each of dst_u and dst_v) and has no tail handling. The three callers
  touched here use it directly only when width is a multiple of 32 and
  otherwise go through SplitUVRow_Any_MSA (ANY12 with mask 31).
* SetRow_MSA stores 16 bytes per iteration; SetPlane selects it only when
  width is a multiple of 16. row.h names its last parameter "count", the
  definition names it "width".
* MirrorUVRow_MSA walks src_uv backwards from src_uv + 2 * width, 64 bytes per
  iteration; RotateUV180 selects it only when width is a multiple of 32.
* SobelXRow_MSA and SobelYRow_MSA produce 16 outputs per iteration and are
  installed in ARGBSobelize without a width check. Each iteration stores a
  whole 16-byte vector; SobelX loads 32 bytes from each of its three rows and
  SobelY additionally reads src_y0[16..17] and src_y1[16..17]. Presumably the
  row buffers are padded for this - TODO confirm against ARGBSobelize.
* SobelYRow_MSA assigns only vec6[0] and vec6[1] before vec6 is passed to
  __msa_sldi_b; the other lanes are never written. Looks intentional (only
  the low bytes appear to be consumed by the 2- and 4-byte slides) - confirm.
* SobelXRow_MSA and SobelYRow_MSA are defined with "int32 width" while the new
  row.h prototypes declare "int width".
* The row_common.cc, scale_neon64.cc and scale_test.cc hunks are formatting
  only.

 include/libyuv/row.h       |  24 ++++++
 source/planar_functions.cc |  39 +++++++++
 source/rotate.cc           |   5 ++
 source/row_any.cc          |   3 +
 source/row_common.cc       |   2 +-
 source/row_msa.cc          | 167 +++++++++++++++++++++++++++++++++++++
 source/scale_neon64.cc     |  32 +++----
 unit_test/scale_test.cc    |  31 ++-----
 8 files changed, 261 insertions(+), 42 deletions(-)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 164433e6b..e9ce278b5 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -422,6 +422,8 @@ extern "C" {
 #define HAS_YUY2TOUVROW_MSA
 #define HAS_YUY2TOYROW_MSA
 #define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_SPLITUVROW_MSA
+#define HAS_MIRRORUVROW_MSA
 
 #ifndef DISABLE_CLANG_MSA
 #define HAS_ABGRTOUVROW_MSA
@@ -467,6 +469,9 @@ extern "C" {
 #define HAS_ARGBBLENDROW_MSA
 #define HAS_ARGBQUANTIZEROW_MSA
 #define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_SETROW_MSA
+#define HAS_SOBELXROW_MSA
+#define HAS_SOBELYROW_MSA
 #endif
 #endif
 
@@ -1362,6 +1367,10 @@ void MirrorUVRow_DSPR2(const uint8* src_uv,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width);
+void MirrorUVRow_MSA(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width);
 void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 
 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
@@ -1391,6 +1400,7 @@ void SplitUVRow_DSPR2(const uint8* src_uv,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width);
+void SplitUVRow_MSA(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
 void SplitUVRow_Any_SSE2(const uint8* src_uv,
                          uint8* dst_u,
                          uint8* dst_v,
@@ -1407,6 +1417,10 @@ void SplitUVRow_Any_DSPR2(const uint8* src_uv,
                           uint8* dst_u,
                           uint8* dst_v,
                           int width);
+void SplitUVRow_Any_MSA(const uint8* src_uv,
+                        uint8* dst_u,
+                        uint8* dst_v,
+                        int width);
 
 void MergeUVRow_C(const uint8* src_u,
                   const uint8* src_v,
@@ -1496,6 +1510,7 @@ void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y,
                                   int width);
 
 void SetRow_C(uint8* dst, uint8 v8, int count);
+void SetRow_MSA(uint8* dst, uint8 v8, int count);
 void SetRow_X86(uint8* dst, uint8 v8, int count);
 void SetRow_ERMS(uint8* dst, uint8 v8, int count);
 void SetRow_NEON(uint8* dst, uint8 v8, int count);
@@ -3024,6 +3039,11 @@ void SobelXRow_NEON(const uint8* src_y0,
                     const uint8* src_y2,
                     uint8* dst_sobelx,
                     int width);
+void SobelXRow_MSA(const uint8* src_y0,
+                   const uint8* src_y1,
+                   const uint8* src_y2,
+                   uint8* dst_sobelx,
+                   int width);
 void SobelYRow_C(const uint8* src_y0,
                  const uint8* src_y1,
                  uint8* dst_sobely,
@@ -3036,6 +3056,10 @@ void SobelYRow_NEON(const uint8* src_y0,
                     const uint8* src_y1,
                     uint8* dst_sobely,
                     int width);
+void SobelYRow_MSA(const uint8* src_y0,
+                   const uint8* src_y1,
+                   uint8* dst_sobely,
+                   int width);
 void SobelRow_C(const uint8* src_sobelx,
                 const uint8* src_sobely,
                 uint8* dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 1bd89f156..2e363ba7a 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -321,6 +321,14 @@ void SplitUVPlane(const uint8* src_uv,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Copy a row of UV.
@@ -1579,6 +1587,11 @@ void SetPlane(uint8* dst_y,
     SetRow = SetRow_ERMS;
   }
 #endif
+#if defined(HAS_SETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+    SetRow = SetRow_MSA;
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -2634,6 +2647,11 @@ static int ARGBSobelize(const uint8* src_argb,
     SobelYRow = SobelYRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelYRow = SobelYRow_MSA;
+  }
+#endif
 #if defined(HAS_SOBELXROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXRow = SobelXRow_SSE2;
@@ -2643,6 +2661,11 @@ static int ARGBSobelize(const uint8* src_argb,
   if (TestCpuFlag(kCpuHasNEON)) {
     SobelXRow = SobelXRow_NEON;
   }
+#endif
+#if defined(HAS_SOBELXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXRow = SobelXRow_MSA;
+  }
 #endif
   {
     // 3 rows with edges before/after.
@@ -3181,6 +3204,14 @@ int YUY2ToNV12(const uint8* src_yuy2,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -3289,6 +3320,14 @@ int UYVYToNV12(const uint8* src_uyvy,
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
diff --git a/source/rotate.cc b/source/rotate.cc
index 4330884ce..b16af5071 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -361,6 +361,11 @@ void RotateUV180(const uint8* src,
     MirrorUVRow = MirrorUVRow_DSPR2;
   }
 #endif
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+    MirrorUVRow = MirrorUVRow_MSA;
+  }
+#endif
 
   dst_a += dst_stride_a * (height - 1);
   dst_b += dst_stride_b * (height - 1);
diff --git a/source/row_any.cc b/source/row_any.cc
index c02ab7955..4511be4b8 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -914,6 +914,9 @@ ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
 #ifdef HAS_SPLITUVROW_DSPR2
 ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
 #endif
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
+#endif
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
 #endif
diff --git a/source/row_common.cc b/source/row_common.cc
index c9f71b851..436e6dd3f 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2643,7 +2643,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
   float fsum = 0.f;
   int i;
 #if defined(__clang__)
-  #pragma clang loop vectorize_width(4)
+#pragma clang loop vectorize_width(4)
 #endif
   for (i = 0; i < width; ++i) {
     float v = *src++;
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 43049644d..83b6a6a55 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -3291,6 +3291,173 @@ void ARGBColorMatrixRow_MSA(const uint8* src_argb,
   }
 }
 #endif
+
+void SplitUVRow_MSA(const uint8* src_uv,
+                    uint8* dst_u,
+                    uint8* dst_v,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_u, 16);
+    ST_UB2(dst2, dst3, dst_v, 16);
+    src_uv += 64;
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
+#ifndef DISABLE_CLANG_MSA
+void SetRow_MSA(uint8* dst, uint8 v8, int width) {
+  int x;
+  v16u8 dst0 = (v16u8)__msa_fill_b(v8);
+
+  for (x = 0; x < width; x += 16) {
+    ST_UB(dst0, dst);
+    dst += 16;
+  }
+}
+#endif
+
+void MirrorUVRow_MSA(const uint8* src_uv,
+                     uint8* dst_u,
+                     uint8* dst_v,
+                     int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
+  v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
+
+  src_uv += (2 * width);
+
+  for (x = 0; x < width; x += 32) {
+    src_uv -= 64;
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_uv, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_uv, 16);
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_uv, 32);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_v, 16);
+    ST_UB2(dst2, dst3, dst_u, 16);
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
+#ifndef DISABLE_CLANG_MSA
+void SobelXRow_MSA(const uint8* src_y0,
+                   const uint8* src_y1,
+                   const uint8* src_y2,
+                   uint8* dst_sobelx,
+                   int32 width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
+  v16i8 tmp = __msa_ldi_b(8);
+  v16i8 mask1 = mask0 + tmp;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_y0, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_y1, 16);
+    src4 = (v16u8)__msa_ld_b((v16i8*)src_y2, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)src_y2, 16);
+    vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
+    vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobelx);
+    src_y0 += 16;
+    src_y1 += 16;
+    src_y2 += 16;
+    dst_sobelx += 16;
+  }
+}
+
+void SobelYRow_MSA(const uint8* src_y0,
+                   const uint8* src_y1,
+                   uint8* dst_sobely,
+                   int32 width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_y1, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6[0] = src_y0[16] - src_y1[16];
+    vec6[1] = src_y0[17] - src_y1[17];
+    vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
+    vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
+    vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
+    vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobely);
+    src_y0 += 16;
+    src_y1 += 16;
+    dst_sobely += 16;
+  }
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index e1da7edb6..36318a9b1 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -1042,39 +1042,40 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
       "ld2 {v0.4h, v1.4h}, [%0], %4 \n"  // load row 1 even pixels
       "ld2 {v2.4h, v3.4h}, [%1], %4 \n"  // load row 2
 
-// consider a variation of this for last 8x2 that replicates the last pixel.
+      // consider a variation of this for last 8x2 that replicates the last
+      // pixel.
       "ld2 {v4.4h, v5.4h}, [%0], %5 \n"  // load row 1 odd pixels
       "ld2 {v6.4h, v7.4h}, [%1], %5 \n"  // load row 2
 
       "subs %w3, %w3, #16 \n"  // 16 dst pixels per loop
 
-// filter first 2x2 group to produce 1st and 4th dest pixels
-// 9 3
-// 3 1
+      // filter first 2x2 group to produce 1st and 4th dest pixels
+      // 9 3
+      // 3 1
       "umull v8.4s, v0.4h, v22.4h \n"
       "umlal v8.4s, v1.4h, v21.4h \n"
       "umlal v8.4s, v2.4h, v21.4h \n"
       "umlal v8.4s, v3.4h, v20.4h \n"
 
-// filter first 2x2 group to produce 2nd and 5th dest pixel
-// 3 9
-// 1 3
+      // filter first 2x2 group to produce 2nd and 5th dest pixel
+      // 3 9
+      // 1 3
       "umull v9.4s, v0.4h, v21.4h \n"
       "umlal v9.4s, v1.4h, v22.4h \n"
       "umlal v9.4s, v2.4h, v20.4h \n"
       "umlal v9.4s, v3.4h, v21.4h \n"
 
-// filter second 2x2 group to produce 3rd and 6th dest pixels
-// 9 3
-// 3 1
+      // filter second 2x2 group to produce 3rd and 6th dest pixels
+      // 9 3
+      // 3 1
       "umull v10.4s, v4.4h, v22.4h \n"
       "umlal v10.4s, v5.4h, v21.4h \n"
       "umlal v10.4s, v6.4h, v21.4h \n"
       "umlal v10.4s, v7.4h, v20.4h \n"
 
-// filter second 2x2 group to produce 4th and 7th dest pixel
-// 3 9
-// 1 3
+      // filter second 2x2 group to produce 4th and 7th dest pixel
+      // 3 9
+      // 1 3
       "umull v11.4s, v4.4h, v21.4h \n"
       "umlal v11.4s, v5.4h, v22.4h \n"
       "umlal v11.4s, v6.4h, v20.4h \n"
@@ -1094,12 +1095,11 @@ void ScaleRowUp2_16_NEON(const uint16* src_ptr,
       : "r"(2LL),  // %4
         "r"(14LL)  // %5
 
-      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v8", "v9", "v10", "v11", "v20", "v21", "v22"  // Clobber List
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+        "v11", "v20", "v21", "v22"  // Clobber List
       );
 }
 
-
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index c74b6b85a..355c6d08a 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -450,7 +450,6 @@ TEST_F(LibYUVScaleTest, TestScaleOdd) {
 }
 #endif  // HAS_SCALEROWDOWN2_SSSE3
 
-
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 extern "C" void ScaleRowUp2_16_NEON(const uint16* src_ptr,
@@ -470,16 +469,10 @@ TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) {
   for (int i = 0; i < 640 * 2 + 1; ++i) {
     orig_pixels[i] = i;
   }
-  ScaleRowUp2_16_NEON(&orig_pixels[0],
-                      640,
-                      &dst_pixels_c[0],
-                      1280);
+  ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_c[0], 1280);
 
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ScaleRowUp2_16_NEON(&orig_pixels[0],
-                        640,
-                        &dst_pixels_opt[0],
-                        1280);
+    ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280);
   }
 
   for (int i = 0; i < 1280; ++i) {
@@ -507,29 +500,17 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
   for (int i = 0; i < 2560 * 2; ++i) {
     orig_pixels[i] = i;
   }
-  ScaleRowDown2Box_16_C(&orig_pixels[0],
-                        2560,
-                        &dst_pixels_c[0],
-                        1280);
+  ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280);
   for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
     int has_neon = TestCpuFlag(kCpuHasNEON);
     if (has_neon) {
-      ScaleRowDown2Box_16_NEON(&orig_pixels[0],
-                               2560,
-                               &dst_pixels_opt[0],
-                               1280);
+      ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
     } else {
-      ScaleRowDown2Box_16_C(&orig_pixels[0],
-                            2560,
-                            &dst_pixels_opt[0],
-                            1280);
+      ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
     }
 #else
-    ScaleRowDown2Box_16_C(&orig_pixels[0],
-                          2560,
-                          &dst_pixels_opt[0],
-                          1280);
+    ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280);
 #endif
   }
 