From 96f98f6915885f84b895dddcf8a553dc50a35aad Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Fri, 31 Jan 2025 02:56:59 -0800
Subject: [PATCH] ARGBToJ444 and RAWToJ444 NEON

- Pass JPEG matrix to ARGBToUV444MatrixRow_NEON
- Keep unsigned (magnitude) constants for the plain NEON kernels. They
  multiply with unsigned umull/umlsl (vmull.u8/vmlsl.u8), so a signed
  coefficient such as -74 would be read as 182 and added instead of
  subtracted. Signed constants are passed only to the I8MM kernel.

Samsung S23:
Was C for UV
ARGBToJ444_Opt (320 ms)
RAWToJ444_Opt (411 ms)

Now I8MM
ARGBToJ444_Opt (196 ms)
RAWToJ444_Opt (301 ms)

NEON
ARGBToJ444_Opt (505 ms)
RAWToJ444_Opt (596 ms)

32 bit ARM NEON
ARGBToJ444_Opt (1135 ms)
RAWToJ444_Opt (1546 ms)

Profile of RAWToJ444
37.72% ARGBToUVJ444Row_NEON_I8MM
34.48% RAWToARGBRow_NEON
14.65% ARGBToYJRow_NEON_DotProd

Bug: 390247964
Change-Id: Ia26240bee974a0baf502548f2fc896b193c3006c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6220890
Reviewed-by: Ben Weiss
---
 include/libyuv/row.h | 18 ++++++++++++++++++
 source/row_any.cc    |  2 ++
 source/row_neon.cc   | 22 ++++++++++++++++++++++
 source/row_neon64.cc | 35 +++++++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+)

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5dead3052..9ee8af68f 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -395,6 +395,7 @@ extern "C" {
 #define HAS_ARGBTORGB565DITHERROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVJ444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
@@ -537,6 +538,7 @@ extern "C" {
 
 #define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
 #define HAS_ARGBTOUV444ROW_NEON_I8MM
+#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
 #endif
 
 // The following are available on AArch64 SVE platforms:
@@ -1858,6 +1860,14 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width);
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width);
 void ARGBToUVRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
@@ -2369,6 +2379,14 @@ void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
                                   uint8_t* dst_u,
                                   uint8_t* dst_v,
                                   int width);
+void ARGBToUVJ444Row_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGBToUVJ444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width);
 void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
                           int src_stride,
                           uint8_t* dst_u,
diff --git a/source/row_any.cc b/source/row_any.cc
index e994d694e..c49ef50bd 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -2155,11 +2155,13 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
 #endif
 #ifdef HAS_YUY2TOUV422ROW_NEON
 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
+ANY12(ARGBToUVJ444Row_Any_NEON, ARGBToUVJ444Row_NEON, 0, 4, 0, 7)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
 #ifdef HAS_ARGBTOUV444ROW_NEON_I8MM
 ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7)
+ANY12(ARGBToUVJ444Row_Any_NEON_I8MM, ARGBToUVJ444Row_NEON_I8MM, 0, 4, 0, 7)
 #endif
 #ifdef HAS_YUY2TOUV422ROW_MSA
 ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
diff --git a/source/row_neon.cc b/source/row_neon.cc
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1888,6 +1888,28 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                             &kRgb24I601UVConstants);
 }
 
+// RGB to JPEG coefficients
+// Stored as unsigned magnitudes: the kernel multiplies with vmull.u8 and
+// applies the negative terms with vmlsl.u8, so the table must not be signed.
+// UB  0.500   coefficient = 127
+// UG -0.33126 coefficient = 84
+// UR -0.16874 coefficient = 43
+// VB -0.08131 coefficient = 20
+// VG -0.41869 coefficient = 107
+// VR  0.500   coefficient = 127 (ignored; the kernel reuses the UB register)
+
+static const struct RgbUVConstants kRgb24JPEGUVConstants = {
+    {127, 84, 43, 0},
+    {20, 107, 127, 0}};
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24JPEGUVConstants);
+}
+
 // clang-format off
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2827,6 +2827,41 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
                               uint8_t* dst_v,
                               int width) {
   ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
                                  &kRgb24I601UVConstantsI8);
+}
+
+// RGB to JPEG coefficients
+// UB  0.500   coefficient = 127
+// UG -0.33126 coefficient = -84
+// UR -0.16874 coefficient = -43
+// VB -0.08131 coefficient = -20
+// VG -0.41869 coefficient = -107
+// VR  0.500   coefficient = 127
+//
+// The U8 table holds magnitudes for ARGBToUV444MatrixRow_NEON, which
+// multiplies unsigned bytes (umull) and subtracts the negative terms itself
+// (umlsl). The I8 table holds signed values for the I8MM kernel.
+
+static const struct RgbUVConstantsU8 kRgb24JPEGUVConstantsU8 = {
+    {127, 84, 43, 0},
+    {20, 107, 127, 0}};
+static const struct RgbUVConstantsI8 kRgb24JPEGUVConstantsI8 = {
+    {127, -84, -43, 0},
+    {-20, -107, 127, 0}};
+
+void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
+                            &kRgb24JPEGUVConstantsU8);
+}
+
+void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
+                               uint8_t* dst_u,
+                               uint8_t* dst_v,
+                               int width) {
+  ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
+                                 &kRgb24JPEGUVConstantsI8);
 }
 