mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
[AArch64] Add I8MM implementation of ARGBToUV444Row
We cannot use the standard dot-product instructions since the coefficient multiplication results are both added and subtracted, but I8MM supports mixed-sign dot products, which work well here. We need to add an additional variant of the coefficient structs since we need negative constants for the elements that were previously subtracted. Reduction in runtimes observed compared to the previous Neon implementation: Cortex-A510: -37.3%, Cortex-A520: -31.1%, Cortex-A715: -37.1%, Cortex-A720: -37.0%, Cortex-X2: -62.1%, Cortex-X3: -62.2%, Cortex-X4: -40.4%. Bug: libyuv:977 Change-Id: Idc3d9a6408c30e1bce3816a1ed926ecd76792236 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5712928 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
302d29d1a8
commit
a4ccf9940e
@ -578,6 +578,7 @@ extern "C" {
|
|||||||
#define HAS_RGBATOYROW_NEON_DOTPROD
|
#define HAS_RGBATOYROW_NEON_DOTPROD
|
||||||
|
|
||||||
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
|
#define HAS_ARGBCOLORMATRIXROW_NEON_I8MM
|
||||||
|
#define HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following are available on AArch64 SVE platforms:
|
// The following are available on AArch64 SVE platforms:
|
||||||
@ -1611,6 +1612,10 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width);
|
int width);
|
||||||
|
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
void ARGBToUVRow_NEON(const uint8_t* src_argb,
|
||||||
int src_stride_argb,
|
int src_stride_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
@ -2118,6 +2123,10 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width);
|
int width);
|
||||||
|
void ARGBToUV444Row_Any_NEON_I8MM(const uint8_t* src_ptr,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
|
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
|
|||||||
@ -68,6 +68,14 @@ int ARGBToI444(const uint8_t* src_argb,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBTOUV444ROW_NEON_I8MM)
|
||||||
|
if (TestCpuFlag(kCpuHasNeonI8MM)) {
|
||||||
|
ARGBToUV444Row = ARGBToUV444Row_Any_NEON_I8MM;
|
||||||
|
if (IS_ALIGNED(width, 8)) {
|
||||||
|
ARGBToUV444Row = ARGBToUV444Row_NEON_I8MM;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOUV444ROW_MSA)
|
#if defined(HAS_ARGBTOUV444ROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
|
ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
|
||||||
|
|||||||
@ -2116,6 +2116,9 @@ ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
|
|||||||
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
|
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
|
||||||
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
|
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAS_ARGBTOUV444ROW_NEON_I8MM
|
||||||
|
ANY12(ARGBToUV444Row_Any_NEON_I8MM, ARGBToUV444Row_NEON_I8MM, 0, 4, 0, 7)
|
||||||
|
#endif
|
||||||
#ifdef HAS_YUY2TOUV422ROW_MSA
|
#ifdef HAS_YUY2TOUV422ROW_MSA
|
||||||
ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
|
ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
|
||||||
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
|
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
|
||||||
|
|||||||
@ -2711,17 +2711,22 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct RgbUVConstants {
|
struct RgbUVConstantsU8 {
|
||||||
uint8_t kRGBToU[4];
|
uint8_t kRGBToU[4];
|
||||||
uint8_t kRGBToV[4];
|
uint8_t kRGBToV[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Signed 8-bit RGB-to-UV coefficient set: four coefficients for U followed by
// four for V. Passed by pointer to ARGBToUV444MatrixRow_NEON_I8MM, which loads
// the two 4-byte groups as two consecutive 32-bit elements.
struct RgbUVConstantsI8 {
|
||||||
|
int8_t kRGBToU[4];
|
||||||
|
int8_t kRGBToV[4];
|
||||||
|
};
|
||||||
|
|
||||||
// 8x1 pixels.
|
// 8x1 pixels.
|
||||||
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbUVConstants* rgbuvconstants) {
|
const struct RgbUVConstantsU8* rgbuvconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"ldr d0, [%4] \n" // load rgbuvconstants
|
"ldr d0, [%4] \n" // load rgbuvconstants
|
||||||
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
|
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
|
||||||
@ -2758,6 +2763,42 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
"v27", "v28", "v29");
|
"v27", "v28", "v29");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 8x1 pixels.
// Converts a row of 4-byte pixels to separate U and V planes (one U and one V
// byte per pixel) using mixed-sign dot products (USDOT): each pixel's four
// unsigned bytes are multiplied by the four signed coefficients in
// rgbuvconstants->kRGBToU / kRGBToV and summed.
//   src_argb:        input pixels, 32 bytes read per iteration.
//   dst_u, dst_v:    outputs, 8 bytes written to each per iteration.
//   width:           pixel count; the loop consumes 8 per iteration and the
//                    body runs before the count is tested, so at least 8
//                    pixels are always processed.
//   rgbuvconstants:  signed U and V coefficients.
void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||||
|
const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width,
|
||||||
|
const struct RgbUVConstantsI8* rgbuvconstants) {
|
||||||
|
// v16 = U coefficients, v17 = V coefficients, each replicated to all lanes.
asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
|
||||||
|
"movi v29.16b, #0x80 \n" // 0x8080 per 16-bit lane = 128.5 in 8.8 fixed point
|
||||||
|
"1: \n"
|
||||||
|
"ldp q0, q1, [%[src]], #32 \n" // load 8 pixels (32 bytes), advance src
|
||||||
|
"movi v2.4s, #0 \n" // usdot accumulates, so clear the sums first
|
||||||
|
"movi v3.4s, #0 \n"
|
||||||
|
"movi v4.4s, #0 \n"
|
||||||
|
"movi v5.4s, #0 \n"
|
||||||
|
"usdot v2.4s, v0.16b, v16.16b \n" // U sums, pixels 0-3
|
||||||
|
"usdot v3.4s, v1.16b, v16.16b \n" // U sums, pixels 4-7
|
||||||
|
"usdot v4.4s, v0.16b, v17.16b \n" // V sums, pixels 0-3
|
||||||
|
"usdot v5.4s, v1.16b, v17.16b \n" // V sums, pixels 4-7
|
||||||
|
"prfm pldl1keep, [%[src], 448] \n" // prefetch hint ahead of src
|
||||||
|
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
|
||||||
|
"uzp1 v0.8h, v2.8h, v3.8h \n" // keep low 16 bits of each 32-bit U sum
|
||||||
|
"uzp1 v1.8h, v4.8h, v5.8h \n" // keep low 16 bits of each 32-bit V sum
|
||||||
|
"addhn v0.8b, v0.8h, v29.8h \n" // +128 -> unsigned
|
||||||
|
"addhn v1.8b, v1.8h, v29.8h \n" // +128 -> unsigned
|
||||||
|
"str d0, [%[dst_u]], #8 \n" // store 8 pixels U.
|
||||||
|
"str d1, [%[dst_v]], #8 \n" // store 8 pixels V.
|
||||||
|
"b.gt 1b \n" // loop while the remaining width is > 0
|
||||||
|
: [src] "+r"(src_argb), // %[src]
|
||||||
|
[dst_u] "+r"(dst_u), // %[dst_u]
|
||||||
|
[dst_v] "+r"(dst_v), // %[dst_v]
|
||||||
|
[width] "+r"(width) // %[width]
|
||||||
|
: [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants]
|
||||||
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
|
||||||
|
"v29");
|
||||||
|
}
|
||||||
|
|
||||||
// RGB to bt601 coefficients
|
// RGB to bt601 coefficients
|
||||||
// UB 0.875 coefficient = 112
|
// UB 0.875 coefficient = 112
|
||||||
// UG -0.5781 coefficient = 74
|
// UG -0.5781 coefficient = 74
|
||||||
@ -2766,15 +2807,27 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
// VG -0.7344 coefficient = 94
|
// VG -0.7344 coefficient = 94
|
||||||
// VR 0.875 coefficient = 112 (ignored)
|
// VR 0.875 coefficient = 112 (ignored)
|
||||||
|
|
||||||
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
|
static const struct RgbUVConstantsU8 kRgb24I601UVConstantsU8 = {
|
||||||
|
{112, 74, 38, 0},
|
||||||
{18, 94, 112, 0}};
|
{18, 94, 112, 0}};
|
||||||
|
// Signed form of the bt601 coefficients for the I8MM path. The magnitudes
// match kRgb24I601UVConstantsU8; the terms the unsigned path subtracts are
// stored here as negative values so a mixed-sign dot product applies them
// directly.
static const struct RgbUVConstantsI8 kRgb24I601UVConstantsI8 = {
|
||||||
|
{112, -74, -38, 0},
|
||||||
|
{-18, -94, 112, 0}};
|
||||||
|
|
||||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
int width) {
|
int width) {
|
||||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||||
&kRgb24I601UVConstants);
|
&kRgb24I601UVConstantsU8);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
|
||||||
|
&kRgb24I601UVConstantsI8);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define RGBTOUV_SETUP_REG \
|
#define RGBTOUV_SETUP_REG \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user