ARGBToJ444 and RAWToJ444 NEON

- Pass JPEG matrix to ARGBToUV444MatrixRow_NEON
- Remove NEON unsigned constants in favor of DOTPROD signed constants

Samsung S23:
Previously (C row function used for UV):
  ARGBToJ444_Opt (320 ms)
  RAWToJ444_Opt (411 ms)
Now, with I8MM:
  ARGBToJ444_Opt (196 ms)
  RAWToJ444_Opt (301 ms)
NEON
  ARGBToJ444_Opt (505 ms)
  RAWToJ444_Opt (596 ms)

32-bit ARM NEON
  ARGBToJ444_Opt (1135 ms)
  RAWToJ444_Opt (1546 ms)

Profile of RAWToJ444
  37.72%  ARGBToUVJ444Row_NEON_I8MM
  34.48%  RAWToARGBRow_NEON
  14.65%  ARGBToYJRow_NEON_DotProd

Bug: 390247964
Change-Id: Ia26240bee974a0baf502548f2fc896b193c3006c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6220890
Reviewed-by: Ben Weiss <bweiss@google.com>
This commit is contained in:
Frank Barchard 2025-01-31 02:56:59 -08:00
parent 1c938f342d
commit 96f98f6915
4 changed files with 97 additions and 38 deletions

View File

@ -395,6 +395,7 @@ extern "C" {
#define HAS_ARGBTORGB565DITHERROW_NEON
#define HAS_ARGBTORGB565ROW_NEON
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
@ -537,6 +538,7 @@ extern "C" {
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUV444ROW_NEON_I8MM
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
#endif
// The following are available on AArch64 SVE platforms:
@ -1858,6 +1860,14 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// NEON row function: converts a row of ARGB pixels into separate U and V rows
// using the JPEG coefficient table. src_argb is the input row, dst_u/dst_v
// receive the results, and width is the number of pixels to convert.
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// I8MM variant of ARGBToUVJ444Row_NEON: same arguments, same JPEG coefficient
// table, implemented with the I8MM matrix row function.
void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2369,6 +2379,14 @@ void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// "Any" wrapper around ARGBToUVJ444Row_NEON, generated with the ANY12 macro.
// NOTE(review): presumably accepts widths that are not a multiple of the
// 8-pixel kernel step - confirm against the ANY12 definition.
void ARGBToUVJ444Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
// "Any" wrapper around ARGBToUVJ444Row_NEON_I8MM, generated with the ANY12
// macro.
// NOTE(review): presumably accepts widths that are not a multiple of the
// 8-pixel kernel step - confirm against the ANY12 definition.
void ARGBToUVJ444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,

View File

@ -2155,11 +2155,13 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_NEON
// NOTE(review): the two ARGB 444 wrappers below are gated by the YUY2 macro,
// not by HAS_ARGBTOUV444ROW_NEON / HAS_ARGBTOUVJ444ROW_NEON. Confirm that all
// three macros are always defined together.
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
// JPEG-coefficient counterpart of the wrapper above; same ANY12 parameters.
ANY12(ARGBToUVJ444Row_Any_NEON, ARGBToUVJ444Row_NEON, 0, 4, 0, 7)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM
// Both I8MM "Any" wrappers share this guard. HAS_ARGBTOUVJ444ROW_NEON_I8MM is
// defined next to HAS_ARGBTOUV444ROW_NEON_I8MM in the row header, so the J444
// wrapper does not use its own macro here.
ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7)
ANY12(ARGBToUVJ444Row_Any_NEON_I8MM, ARGBToUVJ444Row_NEON_I8MM, 0, 4, 0, 7)
#endif
#ifdef HAS_YUY2TOUV422ROW_MSA
ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)

View File

@ -1822,8 +1822,8 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
}
struct RgbUVConstants {
uint8_t kRGBToU[4];
uint8_t kRGBToV[4];
int8_t kRGBToU[4];
int8_t kRGBToV[4];
};
// 8x1 pixels.
@ -1847,12 +1847,12 @@ static void ARGBToUV444MatrixRow_NEON(
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlsl.u8 q2, d1, d25 \n" // G
"vmlsl.u8 q2, d2, d26 \n" // R
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vmull.u8 q3, d2, d24 \n" // R
"vmlsl.u8 q3, d1, d28 \n" // G
"vmlsl.u8 q3, d0, d27 \n" // B
"vmlal.u8 q3, d1, d28 \n" // G
"vmlal.u8 q3, d0, d27 \n" // B
"vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
"vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned
@ -1871,14 +1871,14 @@ static void ARGBToUV444MatrixRow_NEON(
// RGB to bt601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = 74
// UR -0.2969 coefficient = 38
// VB -0.1406 coefficient = 18
// VG -0.7344 coefficient = 94
// VR 0.875 coefficient = 112 (ignored)
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
{18, 94, 112, 0}};
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, -74, -38, 0},
{-18, -94, 112, 0}};
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
@ -1888,6 +1888,26 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
&kRgb24I601UVConstants);
}
// RGB to JPEG coefficients
// UB 0.500 coefficient = 127
// UG -0.33126 coefficient = -84
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -20
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 127
//
// Layout: kRGBToU = {UB, UG, UR, 0}, kRGBToV = {VB, VG, VR, 0}.
// Each row sums to zero (127 - 84 - 43 and -20 - 107 + 127).
// 127 is the largest value an int8_t can hold; presumably 0.500 is stored as
// 127 for that reason - confirm against the C reference.
static const struct RgbUVConstants kRgb24JPEGUVConstants = {
{127, -84, -43, 0},
{-20, -107, 127, 0}};
// Converts a row of ARGB pixels to U and V rows with the JPEG coefficients.
// Thin wrapper: forwards every argument to ARGBToUV444MatrixRow_NEON together
// with kRgb24JPEGUVConstants. The matrix row loop consumes 8 pixels per
// iteration.
// NOTE(review): the table holds negative int8_t entries, while the visible
// matrix row loop accumulates with unsigned vmull.u8 / vmlal.u8 (an unsigned
// multiply sees -84 as 172). Confirm the negative coefficients are handled as
// intended.
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kRgb24JPEGUVConstants);
}
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \

View File

@ -2710,12 +2710,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
);
}
struct RgbUVConstantsU8 {
uint8_t kRGBToU[4];
uint8_t kRGBToV[4];
};
struct RgbUVConstantsI8 {
struct RgbUVConstants {
int8_t kRGBToU[4];
int8_t kRGBToV[4];
};
@ -2726,7 +2721,7 @@ static void ARGBToUV444MatrixRow_NEON(
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstantsU8* rgbuvconstants) {
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"ldr d0, [%4] \n" // load rgbuvconstants
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
@ -2740,13 +2735,13 @@ static void ARGBToUV444MatrixRow_NEON(
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R
"prfm pldl1keep, [%0, 448] \n"
"umull v3.8h, v2.8b, v24.8b \n" // R
"umlsl v3.8h, v1.8b, v28.8b \n" // G
"umlsl v3.8h, v0.8b, v27.8b \n" // B
"umlal v3.8h, v1.8b, v28.8b \n" // G
"umlal v3.8h, v0.8b, v27.8b \n" // B
"addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
"addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
@ -2768,7 +2763,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstantsI8* rgbuvconstants) {
const struct RgbUVConstants* rgbuvconstants) {
asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
"movi v29.16b, #0x80 \n" // 128.5
"1: \n"
@ -2801,25 +2796,21 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
// RGB to bt601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = 74
// UR -0.2969 coefficient = 38
// VB -0.1406 coefficient = 18
// VG -0.7344 coefficient = 94
// VR 0.875 coefficient = 112 (ignored)
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = {
{112, 74, 38, 0},
{18, 94, 112, 0}};
static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = {
{112, -74, -38, 0},
{-18, -94, 112, 0}};
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, -74, -38, 0},
{-18, -94, 112, 0}};
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kRgb24I601UVConstantsU8);
&kRgb24I601UVConstants);
}
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2827,7 +2818,35 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kRgb24I601UVConstantsI8);
&kRgb24I601UVConstants);
}
// RGB to JPEG coefficients
// UB 0.500 coefficient = 127
// UG -0.33126 coefficient = -84
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -20
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 127
//
// Layout: kRGBToU = {UB, UG, UR, 0}, kRGBToV = {VB, VG, VR, 0}.
// Each row sums to zero (127 - 84 - 43 and -20 - 107 + 127).
// The same table is passed to both the NEON and the I8MM matrix row functions.
// 127 is the largest value an int8_t can hold; presumably 0.500 is stored as
// 127 for that reason - confirm against the C reference.
static const struct RgbUVConstants kRgb24JPEGUVConstants = {
{127, -84, -43, 0},
{-20, -107, 127, 0}};
// Converts a row of ARGB pixels to U and V rows with the JPEG coefficients.
// Thin wrapper: forwards every argument to ARGBToUV444MatrixRow_NEON together
// with kRgb24JPEGUVConstants. The matrix row loop consumes 8 pixels per
// iteration.
// NOTE(review): the table holds negative int8_t entries, while the visible
// matrix row loop accumulates with unsigned umull / umlal (an unsigned
// multiply sees -84 as 172). Confirm the negative coefficients are handled as
// intended.
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kRgb24JPEGUVConstants);
}
// I8MM variant of ARGBToUVJ444Row_NEON.
// Thin wrapper: forwards every argument to ARGBToUV444MatrixRow_NEON_I8MM
// together with the signed JPEG table kRgb24JPEGUVConstants.
void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kRgb24JPEGUVConstants);
}
#define RGBTOUV_SETUP_REG \