diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ef5b952f4..3ed94cc8b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -428,6 +428,12 @@ extern "C" {
 #define HAS_RGB565TOUVROW_MSA
 #define HAS_RGB24TOUVROW_MSA
 #define HAS_RAWTOUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXYROW_MSA
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -807,6 +813,21 @@ void I422ToARGB1555Row_MSA(const uint8* src_y,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width);
+void NV12ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
 
 void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
 void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
@@ -2379,6 +2400,21 @@ void I422ToARGB1555Row_Any_MSA(const uint8* src_y,
                                uint8* dst_argb1555,
                                const struct YuvConstants* yuvconstants,
                                int width);
+void NV12ToARGBRow_Any_MSA(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_Any_MSA(const uint8* src_y,
+                             const uint8* src_uv,
+                             uint8* dst_rgb565,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_MSA(const uint8* src_y,
+                           const uint8* src_vu,
+                           uint8* dst_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
 
 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
@@ -2868,6 +2904,10 @@ void SobelRow_NEON(const uint8* src_sobelx,
                    const uint8* src_sobely,
                    uint8* dst_argb,
                    int width);
+void SobelRow_MSA(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width);
 void SobelToPlaneRow_C(const uint8* src_sobelx,
                        const uint8* src_sobely,
                        uint8* dst_y,
@@ -2880,6 +2920,10 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
                           const uint8* src_sobely,
                           uint8* dst_y,
                           int width);
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+                         const uint8* src_sobely,
+                         uint8* dst_y,
+                         int width);
 void SobelXYRow_C(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
@@ -2892,6 +2936,10 @@ void SobelXYRow_NEON(const uint8* src_sobelx,
                      const uint8* src_sobely,
                      uint8* dst_argb,
                      int width);
+void SobelXYRow_MSA(const uint8* src_sobelx,
+                    const uint8* src_sobely,
+                    uint8* dst_argb,
+                    int width);
 void SobelRow_Any_SSE2(const uint8* src_sobelx,
                        const uint8* src_sobely,
                        uint8* dst_argb,
@@ -2900,6 +2948,10 @@ void SobelRow_Any_NEON(const uint8* src_sobelx,
                        const uint8* src_sobely,
                        uint8* dst_argb,
                        int width);
+void SobelRow_Any_MSA(const uint8* src_sobelx,
+                      const uint8* src_sobely,
+                      uint8* dst_argb,
+                      int width);
 void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx,
                               const uint8* src_sobely,
                               uint8* dst_y,
@@ -2908,6 +2960,10 @@ void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx,
                               const uint8* src_sobely,
                               uint8* dst_y,
                               int width);
+void SobelToPlaneRow_Any_MSA(const uint8* src_sobelx,
+                             const uint8* src_sobely,
+                             uint8* dst_y,
+                             int width);
 void SobelXYRow_Any_SSE2(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_argb,
@@ -2916,6 +2972,10 @@ void SobelXYRow_Any_NEON(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_argb,
                          int width);
+void SobelXYRow_Any_MSA(const uint8* src_sobelx,
+                        const uint8* src_sobely,
+                        uint8* dst_argb,
+                        int width);
 
 void ARGBPolynomialRow_C(const uint8* src_argb,
                          uint8* dst_argb,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index ccb3766c5..f7d0765e0 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1326,6 +1326,14 @@ int NV12ToARGB(const uint8* src_y,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1385,6 +1393,14 @@ int NV21ToARGB(const uint8* src_y,
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
@@ -1450,6 +1466,14 @@ int M420ToARGB(const uint8* src_m420,
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 9089bf2e5..04172d9ac 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1443,6 +1443,14 @@ int NV12ToRGB565(const uint8* src_y,
     }
   }
 #endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
@@ -2650,6 +2658,14 @@ int ARGBSobel(const uint8* src_argb,
       SobelRow = SobelRow_NEON;
     }
   }
+#endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelRow = SobelRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_MSA;
+    }
+  }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelRow);
@@ -2680,6 +2696,14 @@ int ARGBSobelToPlane(const uint8* src_argb,
       SobelToPlaneRow = SobelToPlaneRow_NEON;
     }
   }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SobelToPlaneRow = SobelToPlaneRow_MSA;
+    }
+  }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
                       height, SobelToPlaneRow);
@@ -2711,6 +2735,14 @@ int ARGBSobelXY(const uint8* src_argb,
       SobelXYRow = SobelXYRow_NEON;
     }
   }
+#endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXYRow = SobelXYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_MSA;
+    }
+  }
 #endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelXYRow);
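Note on the dispatch blocks above: each conversion installs the _Any_ MSA wrapper whenever the MSA CPU flag is set, then promotes to the full-width kernel only when the row width is a multiple of the kernel's step (8 pixels for the NV12/NV21 kernels, 16 or 32 for the Sobel kernels). A minimal sketch of the selection logic; the stub row functions are illustrative stand-ins, not part of the patch:

    #include <stdint.h>

    typedef void (*NV12RowFn)(const uint8_t* src_y, const uint8_t* src_uv,
                              uint8_t* dst_argb, int width);

    /* Stubs standing in for NV12ToARGBRow_C / _Any_MSA / _MSA. */
    static void RowC(const uint8_t* y, const uint8_t* uv, uint8_t* d, int w) {}
    static void RowAnyMSA(const uint8_t* y, const uint8_t* uv, uint8_t* d, int w) {}
    static void RowMSA(const uint8_t* y, const uint8_t* uv, uint8_t* d, int w) {}

    #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))

    NV12RowFn SelectNV12Row(int has_msa, int width) {
      NV12RowFn fn = RowC;        /* portable fallback */
      if (has_msa) {
        fn = RowAnyMSA;           /* any width: SIMD body plus a tail pass */
        if (IS_ALIGNED(width, 8)) {
          fn = RowMSA;            /* 8 pixels per iteration, no tail needed */
        }
      }
      return fn;
    }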
diff --git a/source/row_any.cc b/source/row_any.cc
index c96a719aa..2af8d1e7c 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -255,18 +255,27 @@ ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #ifdef HAS_SOBELROW_NEON
 ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
 #endif
 #ifdef HAS_SOBELTOPLANEROW_NEON
 ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
 #endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
 #ifdef HAS_SOBELXYROW_SSE2
 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
 #endif
 #ifdef HAS_SOBELXYROW_NEON
 ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #undef ANY21
 
 // Any 2 planes to 1 with yuvconstants
@@ -300,6 +309,9 @@ ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #ifdef HAS_NV12TOARGBROW_DSPR2
 ANY21C(NV12ToARGBRow_Any_DSPR2, NV12ToARGBRow_DSPR2, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV21TOARGBROW_SSSE3
 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
 #endif
@@ -309,6 +321,9 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15)
 #ifdef HAS_NV21TOARGBROW_NEON
 ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
 #endif
@@ -318,6 +333,9 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15)
 #ifdef HAS_NV12TORGB565ROW_NEON
 ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
 #endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
 #undef ANY21C
 
 // Any 1 to 1.
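Note on the ANY21/ANY21C lines above: the trailing arguments are, in order, the chroma subsampling shift, the bytes per pixel of each of the two source planes, the destination bytes per pixel, and the SIMD step minus one (MASK). So SobelToPlaneRow_Any_MSA with MASK 31 means the MSA kernel consumes 32 pixels per iteration. The generated wrapper runs the kernel over the aligned body of the row, then pads the ragged tail into scratch buffers and runs one more full step. A simplified sketch of that idea, assuming single-byte source planes (the real macros in row_any.cc also handle the subsampling shift and multi-byte sources):

    #include <stdint.h>
    #include <string.h>

    typedef void (*Row2Fn)(const uint8_t* a, const uint8_t* b,
                           uint8_t* dst, int width);

    void Any21Sketch(Row2Fn simd, const uint8_t* src_a, const uint8_t* src_b,
                     uint8_t* dst, int width, int mask, int bpp) {
      uint8_t ta[32], tb[32], td[32 * 4];    /* scratch, sized for mask <= 31 */
      int n = width & ~mask;                 /* widest multiple the kernel takes */
      int r = width - n;                     /* leftover pixels */
      if (n > 0) {
        simd(src_a, src_b, dst, n);
      }
      memset(ta, 0, sizeof(ta));             /* zero-pad so a full step is safe */
      memset(tb, 0, sizeof(tb));
      memcpy(ta, src_a + n, r);              /* gather the tail */
      memcpy(tb, src_b + n, r);
      simd(ta, tb, td, mask + 1);            /* one full-width step on scratch */
      memcpy(dst + n * bpp, td, r * bpp);    /* scatter only the valid pixels */
    }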
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 1e174fd66..c5c0e98c5 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -47,65 +47,66 @@ extern "C" {
   }
 
 // Convert 8 pixels of YUV 420 to RGB.
-#define YUVTORGB(in_y, in_u, in_v, ub, vr, ug, vg, bb, bg, br, yg, out_b, \
-                 out_g, out_r) \
-  { \
-    v8i16 vec0_m; \
-    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
-    v4i32 reg5_m, reg6_m, reg7_m, reg8_m, reg9_m; \
-    v4i32 max_val_m = __msa_ldi_w(255); \
-    v8i16 zero_m = {0}; \
-    \
-    in_u = (v16u8)__msa_ilvr_b((v16i8)in_u, (v16i8)in_u); \
-    in_v = (v16u8)__msa_ilvr_b((v16i8)in_v, (v16i8)in_v); \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
-    reg0_m = (v4i32)__msa_ilvr_h(zero_m, vec0_m); \
-    reg1_m = (v4i32)__msa_ilvl_h(zero_m, vec0_m); \
-    reg0_m *= vec_yg; \
-    reg1_m *= vec_yg; \
-    reg0_m = __msa_srai_w(reg0_m, 16); \
-    reg1_m = __msa_srai_w(reg1_m, 16); \
-    reg4_m = reg0_m + br; \
-    reg5_m = reg1_m + br; \
-    reg2_m = reg0_m + bg; \
-    reg3_m = reg1_m + bg; \
-    reg0_m += bb; \
-    reg1_m += bb; \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_u); \
-    reg6_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
-    reg7_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
-    vec0_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_v); \
-    reg8_m = (v4i32)__msa_ilvr_h(zero_m, (v8i16)vec0_m); \
-    reg9_m = (v4i32)__msa_ilvl_h(zero_m, (v8i16)vec0_m); \
-    reg0_m -= reg6_m * ub; \
-    reg1_m -= reg7_m * ub; \
-    reg2_m -= reg6_m * ug; \
-    reg3_m -= reg7_m * ug; \
-    reg4_m -= reg8_m * vr; \
-    reg5_m -= reg9_m * vr; \
-    reg2_m -= reg8_m * vg; \
-    reg3_m -= reg9_m * vg; \
-    reg0_m = __msa_srai_w(reg0_m, 6); \
-    reg1_m = __msa_srai_w(reg1_m, 6); \
-    reg2_m = __msa_srai_w(reg2_m, 6); \
-    reg3_m = __msa_srai_w(reg3_m, 6); \
-    reg4_m = __msa_srai_w(reg4_m, 6); \
-    reg5_m = __msa_srai_w(reg5_m, 6); \
-    reg0_m = __msa_maxi_s_w(reg0_m, 0); \
-    reg1_m = __msa_maxi_s_w(reg1_m, 0); \
-    reg2_m = __msa_maxi_s_w(reg2_m, 0); \
-    reg3_m = __msa_maxi_s_w(reg3_m, 0); \
-    reg4_m = __msa_maxi_s_w(reg4_m, 0); \
-    reg5_m = __msa_maxi_s_w(reg5_m, 0); \
-    reg0_m = __msa_min_s_w(reg0_m, max_val_m); \
-    reg1_m = __msa_min_s_w(reg1_m, max_val_m); \
-    reg2_m = __msa_min_s_w(reg2_m, max_val_m); \
-    reg3_m = __msa_min_s_w(reg3_m, max_val_m); \
-    reg4_m = __msa_min_s_w(reg4_m, max_val_m); \
-    reg5_m = __msa_min_s_w(reg5_m, max_val_m); \
-    out_b = __msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \
-    out_g = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
-    out_r = __msa_pckev_h((v8i16)reg5_m, (v8i16)reg4_m); \
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+  { \
+    v8i16 vec0_m, vec1_m; \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
+    v4i32 reg5_m, reg6_m, reg7_m; \
+    v4i32 max = __msa_ldi_w(255); \
+    v16i8 zero = {0}; \
+    \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in_uv); \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0_m); \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0_m); \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1_m); \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1_m); \
+    reg0_m *= yg; \
+    reg1_m *= yg; \
+    reg2_m *= ubvr; \
+    reg3_m *= ubvr; \
+    reg0_m = __msa_srai_w(reg0_m, 16); \
+    reg1_m = __msa_srai_w(reg1_m, 16); \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
+    reg5_m = reg0_m - reg5_m; \
+    reg6_m = reg1_m - reg6_m; \
+    reg2_m = reg0_m - reg2_m; \
+    reg3_m = reg1_m - reg3_m; \
+    reg7_m = reg0_m - reg7_m; \
+    reg4_m = reg1_m - reg4_m; \
+    reg5_m += bb; \
+    reg6_m += bb; \
+    reg7_m += bg; \
+    reg4_m += bg; \
+    reg2_m += br; \
+    reg3_m += br; \
+    reg5_m = __msa_srai_w(reg5_m, 6); \
+    reg6_m = __msa_srai_w(reg6_m, 6); \
+    reg7_m = __msa_srai_w(reg7_m, 6); \
+    reg4_m = __msa_srai_w(reg4_m, 6); \
+    reg2_m = __msa_srai_w(reg2_m, 6); \
+    reg3_m = __msa_srai_w(reg3_m, 6); \
+    reg5_m = __msa_maxi_s_w(reg5_m, 0); \
+    reg6_m = __msa_maxi_s_w(reg6_m, 0); \
+    reg7_m = __msa_maxi_s_w(reg7_m, 0); \
+    reg4_m = __msa_maxi_s_w(reg4_m, 0); \
+    reg2_m = __msa_maxi_s_w(reg2_m, 0); \
+    reg3_m = __msa_maxi_s_w(reg3_m, 0); \
+    reg5_m = __msa_min_s_w(max, reg5_m); \
+    reg6_m = __msa_min_s_w(max, reg6_m); \
+    reg7_m = __msa_min_s_w(max, reg7_m); \
+    reg4_m = __msa_min_s_w(max, reg4_m); \
+    reg2_m = __msa_min_s_w(max, reg2_m); \
+    reg3_m = __msa_min_s_w(max, reg3_m); \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
+    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
   }
 
 // Pack and Store 8 ARGB values.
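Note on the rewritten YUVTORGB: it now takes Y plus one interleaved UV vector instead of separate U and V, which is exactly what the NV12/NV21 kernels below load, and the I422 callers interleave U and V themselves before calling it. vec_ubvr pairs the ub and vr multipliers so one multiply produces both the blue and red chroma products, and a single dotp_s_w against vec_ugvg yields the combined u*ug + v*vg green term. Per pixel the fixed-point arithmetic is unchanged; a scalar model of it (a sketch: names follow the macro arguments, actual values come from the YuvConstants tables at runtime):

    #include <stdint.h>

    static uint8_t Clamp255(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* One pixel of the macro's math: Y is widened to y*257 (the macro
       interleaves y with itself) and scaled by yg in Q16; chroma
       products and the bb/bg/br biases are Q6. */
    static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                               int32_t ub, int32_t ug, int32_t vg, int32_t vr,
                               int32_t bb, int32_t bg, int32_t br, int32_t yg,
                               uint8_t* b, uint8_t* g, uint8_t* r) {
      int32_t y1 = ((y * 0x0101) * yg) >> 16;
      *b = Clamp255((y1 - u * ub + bb) >> 6);
      *g = Clamp255((y1 - (u * ug + v * vg) + bg) >> 6);
      *r = Clamp255((y1 - v * vr + br) >> 6);
    }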
@@ -212,15 +213,19 @@ void I422ToARGBRow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 const_255 = (v16u8)__msa_ldi_b(255);
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     STOREARGB(vec0, vec1, vec2, const_255, rgb_buf);
     src_y += 8;
     src_u += 4;
@@ -239,15 +244,19 @@ void I422ToRGBARow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 const_255 = (v16u8)__msa_ldi_b(255);
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     STOREARGB(const_255, vec0, vec1, vec2, rgb_buf);
     src_y += 8;
     src_u += 4;
@@ -268,17 +277,21 @@ void I422AlphaToARGBRow_MSA(const uint8* src_y,
   v16u8 src0, src1, src2, src3;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v4i32 zero = {0};
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     data_a = LD(src_a);
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
     src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
     STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
     src_y += 8;
@@ -297,9 +310,10 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
                         int32 width) {
   int x;
   int64 data_u, data_v;
-  v16u8 src0, src1, src2, src3, src4, src5, dst0, dst1, dst2;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
   v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v16u8 reg0, reg1, reg2, reg3;
   v2i64 zero = {0};
   v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
@@ -309,6 +323,8 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 16) {
     src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
@@ -316,13 +332,13 @@ void I422ToRGB24Row_MSA(const uint8* src_y,
     data_v = LD(src_v);
     src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
     src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
     src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
-    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 4);
-    src5 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
-    YUVTORGB(src3, src4, src5, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec3, vec4, vec5);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec3, vec4, vec5);
     reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
     reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
     reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
@@ -350,14 +366,18 @@ void I422ToRGB565Row_MSA(const uint8* src_y,
   v16u8 src0, src1, src2, dst0;
   v8i16 vec0, vec1, vec2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec2, vec1);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec2, vec1);
     vec0 = __msa_srai_h(vec0, 3);
     vec1 = __msa_srai_h(vec1, 3);
     vec2 = __msa_srai_h(vec2, 2);
@@ -385,15 +405,19 @@ void I422ToARGB4444Row_MSA(const uint8* src_y,
   v8i16 vec0, vec1, vec2;
   v8u16 reg0, reg1, reg2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     reg0 = (v8u16)__msa_srai_h(vec0, 4);
     reg1 = (v8u16)__msa_srai_h(vec1, 4);
     reg2 = (v8u16)__msa_srai_h(vec2, 4);
@@ -421,15 +445,19 @@ void I422ToARGB1555Row_MSA(const uint8* src_y,
   v8i16 vec0, vec1, vec2;
   v8u16 reg0, reg1, reg2;
   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
   v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
 
   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                  vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
 
   for (x = 0; x < width; x += 8) {
     READYUV422(src_y, src_u, src_v, src0, src1, src2);
-    YUVTORGB(src0, src1, src2, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
-             vec_br, vec_yg, vec0, vec1, vec2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
     reg0 = (v8u16)__msa_srai_h(vec0, 3);
     reg1 = (v8u16)__msa_srai_h(vec1, 3);
     reg2 = (v8u16)__msa_srai_h(vec2, 3);
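Note on the three kernels above: they share the YUVTORGB core and differ only in how the 8-bit channels are narrowed and packed into 16-bit pixels. In scalar terms the packing is (little-endian layouts matching libyuv's C row functions; a reference sketch, not part of the patch):

    #include <stdint.h>

    uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }

    uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
                        ((a >> 7) << 15));
    }

    uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 4) | ((g >> 4) << 4) | ((r >> 4) << 8) |
                        ((a >> 4) << 12));
    }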
@@ -2023,6 +2051,195 @@ void RAWToUVRow_MSA(const uint8* src_rgb0,
   }
 }
 
+void NV12ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 32;
+  }
+}
+
+void NV12ToRGB565Row_MSA(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* rgb_buf,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, rgb_buf);
+    src_y += 8;
+    src_uv += 8;
+    rgb_buf += 16;
+  }
+}
+
+void NV21ToARGBRow_MSA(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* rgb_buf,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64 val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v16u8 zero = {0};
+  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)const_255, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, rgb_buf, 16);
+    src_y += 8;
+    src_vu += 8;
+    rgb_buf += 32;
+  }
+}
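Note on NV21ToARGBRow_MSA: NV21 differs from NV12 only in chroma byte order (V before U), so the kernel reuses the NV12 path after __msa_vshf_b with shuffler {1, 0, 3, 2, ...} swaps each byte pair in the loaded VU vector. The scalar equivalent of that shuffle (a sketch, not part of the patch):

    #include <stdint.h>

    /* Swap each V,U byte pair in place so NV21 chroma reads as NV12. */
    void SwapVUPairs(uint8_t* vu, int pairs) {
      int i;
      for (i = 0; i < pairs; ++i) {
        uint8_t t = vu[2 * i];
        vu[2 * i] = vu[2 * i + 1];
        vu[2 * i + 1] = t;
      }
    }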
+
+void SobelRow_MSA(const uint8* src_sobelx,
+                  const uint8* src_sobely,
+                  uint8* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)const_0xFF, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)const_0xFF, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)const_0xFF, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)const_0xFF, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8* src_sobelx,
+                         const uint8* src_sobely,
+                         uint8* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8* src_sobelx,
+                    const uint8* src_sobely,
+                    uint8* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 const_0xFF = (v16u8)__msa_ldi_b(0xFF);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)const_0xFF, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)const_0xFF, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
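Note on the three Sobel kernels: they share one core operation, a saturating unsigned add of the X and Y gradient planes (__msa_adds_u_b), and differ only in output layout. SobelRow replicates the sum across B, G and R with opaque alpha; SobelToPlaneRow stores the raw sum plane; SobelXYRow keeps the X gradient in red, the Y gradient in blue, and the sum in green. A scalar reference (a sketch mirroring the corresponding _C row functions in libyuv):

    #include <stdint.h>

    static uint8_t SatAddU8(uint8_t a, uint8_t b) {
      int s = a + b;                        /* per-byte __msa_adds_u_b */
      return (uint8_t)(s > 255 ? 255 : s);
    }

    void SobelRowSketch(const uint8_t* sx, const uint8_t* sy,
                        uint8_t* dst_argb, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        uint8_t s = SatAddU8(sx[i], sy[i]);
        dst_argb[4 * i + 0] = s;            /* B */
        dst_argb[4 * i + 1] = s;            /* G */
        dst_argb[4 * i + 2] = s;            /* R */
        dst_argb[4 * i + 3] = 255;          /* A */
      }
    }

    void SobelXYRowSketch(const uint8_t* sx, const uint8_t* sy,
                          uint8_t* dst_argb, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst_argb[4 * i + 0] = sy[i];                  /* B: Y gradient */
        dst_argb[4 * i + 1] = SatAddU8(sx[i], sy[i]); /* G: combined   */
        dst_argb[4 * i + 2] = sx[i];                  /* R: X gradient */
        dst_argb[4 * i + 3] = 255;                    /* A */
      }
    }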