diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d080bb2e8..5e0d2ae9f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -578,6 +578,7 @@ extern "C" { #define HAS_RGBATOYROW_NEON_DOTPROD #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM +#define HAS_ARGBTOUV444ROW_NEON_I8MM #endif // The following are available on AArch64 SVE platforms: @@ -1611,6 +1612,10 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2118,6 +2123,10 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index a766beaa6..a27f9d922 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM) + if (TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM; + } + } +#endif #if defined(HAS_ARGBTOUV444ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUV444Row = ARGBToUV444Row_Any_MSA; diff --git a/source/row_any.cc b/source/row_any.cc index 50485bcdf..2118ad500 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2116,6 +2116,9 @@ ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM +ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7) +#endif #ifdef HAS_YUY2TOUV422ROW_MSA ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 062e7fb19..e5eb353b7 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2711,18 +2711,23 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -struct RgbUVConstants { +struct RgbUVConstantsU8 { uint8_t kRGBToU[4]; uint8_t kRGBToV[4]; }; +struct RgbUVConstantsI8 { + int8_t kRGBToU[4]; + int8_t kRGBToV[4]; +}; + // 8x1 pixels. void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width, - const struct RgbUVConstants* rgbuvconstants) { - asm volatile ( + const struct RgbUVConstantsU8* rgbuvconstants) { + asm volatile( "ldr d0, [%4] \n" // load rgbuvconstants "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient @@ -2758,6 +2763,42 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, "v27", "v28", "v29"); } +void ARGBToUV444MatrixRow_NEON_I8MM( + const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct RgbUVConstantsI8* rgbuvconstants) { + asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" + "movi v29.16b, #0x80 \n" // 128.5 + "1: \n" + "ldp q0, q1, [%[src]], #32 \n" + "movi v2.4s, #0 \n" + "movi v3.4s, #0 \n" + "movi v4.4s, #0 \n" + "movi v5.4s, #0 \n" + "usdot v2.4s, v0.16b, v16.16b \n" + "usdot v3.4s, v1.16b, v16.16b \n" + "usdot v4.4s, v0.16b, v17.16b \n" + "usdot v5.4s, v1.16b, v17.16b \n" + "prfm pldl1keep, [%[src], 448] \n" + "subs %w[width], %w[width], #8 \n" // 8 processed per loop. + "uzp1 v0.8h, v2.8h, v3.8h \n" + "uzp1 v1.8h, v4.8h, v5.8h \n" + "addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned + "addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned + "str d0, [%[dst_u]], #8 \n" // store 8 pixels U. + "str d1, [%[dst_v]], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : [src] "+r"(src_argb), // %[src] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants] + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", + "v29"); +} + // RGB to bt601 coefficients // UB 0.875 coefficient = 112 // UG -0.5781 coefficient = 74 @@ -2766,15 +2807,27 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, // VG -0.7344 coefficient = 94 // VR 0.875 coefficient = 112 (ignored) -static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0}, - {18, 94, 112, 0}}; +static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = { + {112, 74, 38, 0}, + {18, 94, 112, 0}}; +static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = { + {112, -74, -38, 0}, + {-18, -94, 112, 0}}; void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kRgb24I601UVConstants); + &kRgb24I601UVConstantsU8); +} + +void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, + &kRgb24I601UVConstantsI8); } #define RGBTOUV_SETUP_REG \