[AArch64] Use Neon dot-product instructions in ARGBToYMatrixRow

Using the dot-product instructions here allows us to avoid needing LD4
for loading individual colour channels, which gives a big benefit on
some micro-architectures where such instructions perform significantly
worse than LD1. In addition the dot-product instructions have higher
throughput compared to the Neon multiply-accumulate instructions used by
the existing *_NEON kernels.

Observed reduction in runtimes for selected kernels moving from *_NEON
to *_NEON_DotProd:

     Kernel | Cortex-A55 | Cortex-A510 | Cortex-A76 | Cortex-X2
ABGRToYJRow |      -6.5% |      -22.5% |     -43.5% |    -71.2%
 ABGRToYRow |      -6.5% |      -22.5% |     -43.5% |    -68.3%
ARGBToYJRow |      -6.5% |      -22.5% |     -43.5% |    -68.1%
 ARGBToYRow |      -6.5% |      -22.5% |     -43.5% |    -68.1%
 BGRAToYRow |      -6.5% |      -22.5% |     -42.3% |    -68.4%
RGBAToYJRow |      -6.5% |      -22.5% |     -42.2% |    -73.7%
 RGBAToYRow |      -6.5% |      -22.5% |     -42.3% |    -64.9%

Bug: libyuv:977
Change-Id: If244190a7bdacf7e6e6b16af7e6853ee13ff6585
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5424737
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
George Steed 2024-03-15 18:22:03 +00:00 committed by Frank Barchard
parent 6406179063
commit f2e78e1304
5 changed files with 330 additions and 0 deletions

View File

@ -421,7 +421,9 @@ extern "C" {
#define HAS_ABGRTOUVJROW_NEON #define HAS_ABGRTOUVJROW_NEON
#define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOUVROW_NEON
#define HAS_ABGRTOYJROW_NEON #define HAS_ABGRTOYJROW_NEON
#define HAS_ABGRTOYJROW_NEON_DOTPROD
#define HAS_ABGRTOYROW_NEON #define HAS_ABGRTOYROW_NEON
#define HAS_ABGRTOYROW_NEON_DOTPROD
#define HAS_AR64TOARGBROW_NEON #define HAS_AR64TOARGBROW_NEON
#define HAS_ARGB1555TOARGBROW_NEON #define HAS_ARGB1555TOARGBROW_NEON
#define HAS_ARGB1555TOUVROW_NEON #define HAS_ARGB1555TOUVROW_NEON
@ -443,12 +445,15 @@ extern "C" {
#define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYJROW_NEON_DOTPROD
#define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYROW_NEON
#define HAS_ARGBTOYROW_NEON_DOTPROD
#define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOVUROW_NEON
#define HAS_AYUVTOYROW_NEON #define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON #define HAS_BGRATOUVROW_NEON
#define HAS_BGRATOYROW_NEON #define HAS_BGRATOYROW_NEON
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_BYTETOFLOATROW_NEON #define HAS_BYTETOFLOATROW_NEON
#define HAS_CONVERT16TO8ROW_NEON #define HAS_CONVERT16TO8ROW_NEON
#define HAS_COPYROW_NEON #define HAS_COPYROW_NEON
@ -512,7 +517,9 @@ extern "C" {
#define HAS_RGB565TOYROW_NEON #define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON #define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYJROW_NEON #define HAS_RGBATOYJROW_NEON
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON #define HAS_RGBATOYROW_NEON
#define HAS_RGBATOYROW_NEON_DOTPROD
#define HAS_SETROW_NEON #define HAS_SETROW_NEON
#define HAS_SPLITARGBROW_NEON #define HAS_SPLITARGBROW_NEON
#define HAS_SPLITRGBROW_NEON #define HAS_SPLITRGBROW_NEON
@ -1409,6 +1416,18 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width); void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
// Neon dot-product variants of the ARGB/ABGR/RGBA to Y row functions. Each
// takes the same (source row, destination Y row, width) arguments as the
// *_NEON function of the same base name.
void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width);
void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_yj,
int width);
void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
uint8_t* dst_yj,
int width);
void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_yj,
int width);
void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width); void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ABGRToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
@ -1632,6 +1651,15 @@ void RAWToUVRow_LASX(const uint8_t* src_raw,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
// Neon dot-product variants of the BGRA/ABGR/RGBA to Y row functions. Each
// takes the same (source row, destination Y row, width) arguments as the
// *_NEON function of the same base name.
void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
uint8_t* dst_y,
int width);
void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
uint8_t* dst_y,
int width);
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_y,
int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
@ -1709,12 +1737,33 @@ void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
// The *_Any_NEON_DotProd declarations below share the
// (src_ptr, dst_ptr, width) signature of the *_Any_NEON functions they are
// listed next to. The conversion entry points select the _Any_ form first and
// switch to the plain *_NEON_DotProd kernel only when width is a multiple
// of 16.
void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

View File

@ -1904,6 +1904,14 @@ int ARGBToI420(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -2066,6 +2074,14 @@ int ARGBToI420Alpha(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -2229,6 +2245,14 @@ int BGRAToI420(const uint8_t* src_bgra,
} }
} }
#endif #endif
#if defined(HAS_BGRATOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
BGRAToYRow = BGRAToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
BGRAToYRow = BGRAToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_BGRATOUVROW_NEON) #if defined(HAS_BGRATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
BGRAToUVRow = BGRAToUVRow_Any_NEON; BGRAToUVRow = BGRAToUVRow_Any_NEON;
@ -2388,6 +2412,14 @@ int ABGRToI420(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON) #if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVRow = ABGRToUVRow_Any_NEON; ABGRToUVRow = ABGRToUVRow_Any_NEON;
@ -2497,6 +2529,14 @@ int RGBAToI420(const uint8_t* src_rgba,
} }
} }
#endif #endif
#if defined(HAS_RGBATOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
RGBAToYRow = RGBAToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
RGBAToYRow = RGBAToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_RGBATOUVROW_NEON) #if defined(HAS_RGBATOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
RGBAToUVRow = RGBAToUVRow_Any_NEON; RGBAToUVRow = RGBAToUVRow_Any_NEON;

View File

@ -116,6 +116,14 @@ int ARGBToI444(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) #if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA; ARGBToYRow = ARGBToYRow_Any_MSA;
@ -231,6 +239,14 @@ int ARGBToI422(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -331,6 +347,14 @@ int ARGBToNV12(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -553,6 +577,14 @@ int ARGBToNV21(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -752,6 +784,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON) #if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVRow = ABGRToUVRow_Any_NEON; ABGRToUVRow = ABGRToUVRow_Any_NEON;
@ -940,6 +980,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYRow = ABGRToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYRow = ABGRToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_NEON) #if defined(HAS_ABGRTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVRow = ABGRToUVRow_Any_NEON; ABGRToUVRow = ABGRToUVRow_Any_NEON;
@ -1133,6 +1181,14 @@ int ARGBToYUY2(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -1329,6 +1385,14 @@ int ARGBToUYVY(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_NEON) #if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON; ARGBToUVRow = ARGBToUVRow_Any_NEON;
@ -1501,6 +1565,14 @@ int ARGBToI400(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYRow = ARGBToYRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MSA) #if defined(HAS_ARGBTOYROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYRow = ARGBToYRow_Any_MSA; ARGBToYRow = ARGBToYRow_Any_MSA;
@ -2248,6 +2320,14 @@ int ARGBToJ420(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON) #if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON; ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
@ -2416,6 +2496,14 @@ int ARGBToJ422(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_NEON) #if defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON; ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
@ -2522,6 +2610,14 @@ int ARGBToJ400(const uint8_t* src_argb,
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYJRow = ARGBToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYJRow = ARGBToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_MSA) #if defined(HAS_ARGBTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
ARGBToYJRow = ARGBToYJRow_Any_MSA; ARGBToYJRow = ARGBToYJRow_Any_MSA;
@ -2593,6 +2689,14 @@ int RGBAToJ400(const uint8_t* src_rgba,
} }
} }
#endif #endif
#if defined(HAS_RGBATOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
RGBAToYJRow = RGBAToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
RGBAToYJRow = RGBAToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_RGBATOYJROW_MSA) #if defined(HAS_RGBATOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYJRow = RGBAToYJRow_Any_MSA; RGBAToYJRow = RGBAToYJRow_Any_MSA;
@ -2698,6 +2802,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOUVJROW_NEON) #if defined(HAS_ABGRTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVJRow = ABGRToUVJRow_Any_NEON; ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
@ -2828,6 +2940,14 @@ int ABGRToJ422(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOUVJROW_NEON) #if defined(HAS_ABGRTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ABGRToUVJRow = ABGRToUVJRow_Any_NEON; ABGRToUVJRow = ABGRToUVJRow_Any_NEON;
@ -2930,6 +3050,14 @@ int ABGRToJ400(const uint8_t* src_abgr,
} }
} }
#endif #endif
#if defined(HAS_ABGRTOYJROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ABGRToYJRow = ABGRToYJRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ABGRToYJRow = ABGRToYJRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ABGRTOYJROW_MSA) #if defined(HAS_ABGRTOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) { if (TestCpuFlag(kCpuHasMSA)) {
ABGRToYJRow = ABGRToYJRow_Any_MSA; ABGRToYJRow = ABGRToYJRow_Any_MSA;

View File

@ -1072,6 +1072,9 @@ ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
#ifdef HAS_ARGBTOYROW_NEON #ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15) ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for ARGBToYRow_NEON_DotProd, using the same ANY11
// arguments as the ARGBToYRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_ARGBTOYROW_NEON_DOTPROD
ANY11(ARGBToYRow_Any_NEON_DotProd, ARGBToYRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_MSA #ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif #endif
@ -1084,12 +1087,21 @@ ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
#ifdef HAS_ARGBTOYJROW_NEON #ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15) ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for ARGBToYJRow_NEON_DotProd, using the same ANY11
// arguments as the ARGBToYJRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_ARGBTOYJROW_NEON_DOTPROD
ANY11(ARGBToYJRow_Any_NEON_DotProd, ARGBToYJRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_ABGRTOYJROW_NEON #ifdef HAS_ABGRTOYJROW_NEON
ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15) ANY11(ABGRToYJRow_Any_NEON, ABGRToYJRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for ABGRToYJRow_NEON_DotProd, using the same ANY11
// arguments as the ABGRToYJRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_ABGRTOYJROW_NEON_DOTPROD
ANY11(ABGRToYJRow_Any_NEON_DotProd, ABGRToYJRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYJROW_NEON #ifdef HAS_RGBATOYJROW_NEON
ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15) ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for RGBAToYJRow_NEON_DotProd, using the same ANY11
// arguments as the RGBAToYJRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_RGBATOYJROW_NEON_DOTPROD
ANY11(RGBAToYJRow_Any_NEON_DotProd, RGBAToYJRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_MSA #ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif #endif
@ -1114,6 +1126,9 @@ ANY11(ABGRToYJRow_Any_LASX, ABGRToYJRow_LASX, 0, 4, 1, 31)
#ifdef HAS_BGRATOYROW_NEON #ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15) ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for BGRAToYRow_NEON_DotProd, using the same ANY11
// arguments as the BGRAToYRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_BGRATOYROW_NEON_DOTPROD
ANY11(BGRAToYRow_Any_NEON_DotProd, BGRAToYRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_MSA #ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif #endif
@ -1126,6 +1141,9 @@ ANY11(BGRAToYRow_Any_LASX, BGRAToYRow_LASX, 0, 4, 1, 31)
#ifdef HAS_ABGRTOYROW_NEON #ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15) ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for ABGRToYRow_NEON_DotProd, using the same ANY11
// arguments as the ABGRToYRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_ABGRTOYROW_NEON_DOTPROD
ANY11(ABGRToYRow_Any_NEON_DotProd, ABGRToYRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_ABGRTOYROW_MSA #ifdef HAS_ABGRTOYROW_MSA
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif #endif
@ -1138,6 +1156,9 @@ ANY11(ABGRToYRow_Any_LASX, ABGRToYRow_LASX, 0, 4, 1, 31)
#ifdef HAS_RGBATOYROW_NEON #ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15)
#endif #endif
// Any-width wrapper for RGBAToYRow_NEON_DotProd, using the same ANY11
// arguments as the RGBAToYRow_Any_NEON entry.
// NOTE(review): the trailing 15 is presumably the width mask for a kernel
// that handles 16 pixels per iteration - confirm against the ANY11 definition.
#ifdef HAS_RGBATOYROW_NEON_DOTPROD
ANY11(RGBAToYRow_Any_NEON_DotProd, RGBAToYRow_NEON_DotProd, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYROW_MSA #ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif #endif

View File

@ -3029,12 +3029,49 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
"v17"); "v17");
} }
// Converts a row of 4-byte pixels to one 8-bit Y value per pixel using the
// AArch64 UDOT (unsigned dot-product) instruction.
//
// src_argb:     source row; 64 bytes (16 pixels) are read per loop iteration.
// dst_y:        destination row; 16 bytes are written per loop iteration.
// width:        pixel count. The loop body runs before the count is tested
//               and repeats while the remaining count is positive, so it
//               always handles whole groups of 16 pixels. The conversion
//               entry points only select the plain kernel when
//               IS_ALIGNED(width, 16) holds; other widths go through the
//               _Any_ wrappers.
// rgbconstants: 8 bytes are loaded from this pointer. The first 32-bit word
//               supplies four per-byte weights, applied to the four bytes of
//               each pixel in memory order; the third 16-bit halfword supplies
//               the value added before narrowing.
//               NOTE(review): assumes struct RgbConstants is four uint8_t
//               weights followed by a 16-bit addend - confirm against the
//               struct definition.
void
ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr d0, [%3] \n" // load rgbconstants
// v16: the four byte weights replicated into every 32-bit lane.
"dup v16.4s, v0.s[0] \n"
// v17: the 16-bit addend replicated into every halfword lane.
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
// UDOT accumulates into its destination, so clear the accumulators first.
"movi v0.16b, #0 \n"
"movi v1.16b, #0 \n"
"movi v2.16b, #0 \n"
"movi v3.16b, #0 \n"
// Each 32-bit lane becomes the weighted sum of one pixel's four bytes.
"udot v0.4s, v4.16b, v16.16b \n"
"udot v1.4s, v5.16b, v16.16b \n"
"udot v2.4s, v6.16b, v16.16b \n"
"udot v3.4s, v7.16b, v16.16b \n"
// Keep the even-indexed halfwords, i.e. the low 16 bits of every 32-bit sum.
"uzp1 v0.8h, v0.8h, v1.8h \n"
"uzp1 v1.8h, v2.8h, v3.8h \n"
// Add the addend and keep the high byte of each halfword:
// (sum + addend) >> 8.
"addhn v0.8b, v0.8h, v17.8h \n"
"addhn v1.8b, v1.8h, v17.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
// Flags are still those set by the subs above; loop while pixels remain.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
// RGB to JPeg coefficients // RGB to JPeg coefficients
// B * 0.1140 coefficient = 29 // B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150 // G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77 // R * 0.2990 coefficient = 77
// Add 0.5 = 0x80 // Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128}; static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
// Same weights and addend as kRgb24JPEGConstants, with the weights moved one
// byte later so the zero weight falls on the first byte of each pixel rather
// than the last. Used by RGBAToYJRow_NEON_DotProd.
static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77},
128};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128}; static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
@ -3046,8 +3083,12 @@ static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080}; 0x1080};
// Same weights and addend as kRgb24I601Constants, with the weights moved one
// byte later so the zero weight falls on the first byte of each pixel rather
// than the last. Used by RGBAToYRow_NEON_DotProd.
static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66},
0x1080};
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
// Same weights and addend as kRawI601Constants, with the weights moved one
// byte later so the zero weight falls on the first byte of each pixel rather
// than the last. Used by BGRAToYRow_NEON_DotProd.
static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25},
0x1080};
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
@ -3065,6 +3106,30 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
} }
void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width) {
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants);
}
void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_yj,
int width) {
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}
// ABGR row -> Y row via the dot-product kernel, weighted with
// kRawI601Constants.
void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
                             uint8_t* dst_y,
                             int width) {
  const struct RgbConstants* const coeffs = &kRawI601Constants;
  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, coeffs);
}
// ABGR row -> YJ row via the dot-product kernel, weighted with
// kRawJPEGConstants.
void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
                              uint8_t* dst_yj,
                              int width) {
  const struct RgbConstants* const coeffs = &kRawJPEGConstants;
  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, coeffs);
}
// RGBA expects first value to be A and ignored, then 3 values to contain RGB. // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4 // Same code as ARGB, except the LD4
void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
@ -3112,6 +3177,33 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
} }
// RGBA row -> Y row. Reuses the ARGB dot-product kernel: only the weight
// table differs (kRgb24I601DotProdConstants has the zero weight in the first
// byte instead of the last), so no RGBA-specific kernel is required.
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
                             uint8_t* dst_y,
                             int width) {
  const struct RgbConstants* const permuted = &kRgb24I601DotProdConstants;
  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, permuted);
}
// RGBA row -> YJ row. Reuses the ARGB dot-product kernel: only the weight
// table differs (kRgb24JPEGDotProdConstants has the zero weight in the first
// byte instead of the last), so no RGBA-specific kernel is required.
void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
                              uint8_t* dst_yj,
                              int width) {
  const struct RgbConstants* const permuted = &kRgb24JPEGDotProdConstants;
  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, permuted);
}
// BGRA row -> Y row. Reuses the ARGB dot-product kernel: only the weight
// table differs (kRawI601DotProdConstants has the zero weight in the first
// byte instead of the last), so no BGRA-specific kernel is required.
void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
                             uint8_t* dst_y,
                             int width) {
  const struct RgbConstants* const permuted = &kRawI601DotProdConstants;
  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, permuted);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,