[libyuv] Remove all x86 SSE optimizations

Removed all SSE functions, macros, dispatching logic, and related
unit tests across the repository to reduce code size and complexity.
Left cpuid detection intact. Code paths for other instruction sets
(AVX2, NEON, SVE, etc.) are unaffected.

R=rrwinterton@gmail.com

Bug: None
Test: Build and run libyuv_unittest
Change-Id: Id19608dba35b79c4c8fc31f920a6a968883d300f
This commit is contained in:
Frank Barchard 2026-04-29 17:06:56 -07:00
parent f2ac6db694
commit 36e0fd216b
29 changed files with 2031 additions and 2357 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1934
Version: 1928
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -456,40 +456,6 @@ int ARGBToUYVY(const uint8_t* src_argb,
int width,
int height);
// RAW to NV21 with Matrix
LIBYUV_API
int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
const struct ArgbConstants* argbconstants,
int width,
int height);
// RAW to NV21
LIBYUV_API
int RAWToNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height);
// RGB24 to NV12
LIBYUV_API
int RGB24ToNV12(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// RAW to JNV21 full range NV21
LIBYUV_API
int RAWToJNV21(const uint8_t* src_raw,

View File

@ -487,9 +487,6 @@ int NV21ToNV12(const uint8_t* src_y,
int width,
int height);
// Alias
#define NV12ToNV21 NV21ToNV12
LIBYUV_API
int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,

View File

@ -140,13 +140,6 @@ extern "C" {
// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__) || \
defined(_M_X64) || defined(_M_X86))
#define HAS_ARGBTOUVMATRIXROW_AVX2
#define HAS_MERGEUVROW_AVX2
#endif
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
(defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
defined(GCC_HAS_AVX2))
@ -170,6 +163,7 @@ extern "C" {
#define HAS_I444TORGB24ROW_AVX2
#define HAS_INTERPOLATEROW_AVX2
#define HAS_J422TOARGBROW_AVX2
#define HAS_MERGEUVROW_AVX2
#define HAS_MIRRORROW_AVX2
#define HAS_NV12TOARGBROW_AVX2
#define HAS_NV12TORGB24ROW_AVX2
@ -200,6 +194,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \
(defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_ENABLE_ROWWIN)
#define HAS_RAWTOYJROW_SSSE3
#define HAS_AB64TOARGBROW_SSSE3
#define HAS_ABGRTOAR30ROW_SSSE3
#define HAS_ABGRTOYJROW_SSSE3
@ -250,9 +245,11 @@ extern "C" {
// TODO: port row_win to use 8 bit coefficients.
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_ARGBTOYMATRIXROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#define HAS_RAWTOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_RGB24TOYJROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
// TODO: adjust row_win to use 8 bit negative coefficients.
@ -300,7 +297,6 @@ extern "C" {
#define HAS_ARGBTOUV444MATRIXROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
@ -334,6 +330,8 @@ extern "C" {
#define HAS_P210TOARGBROW_AVX2
#define HAS_P410TOAR30ROW_AVX2
#define HAS_P410TOARGBROW_AVX2
#define HAS_RAWTOYJROW_AVX2
#define HAS_RGB24TOYJROW_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SPLITARGBROW_AVX2
#define HAS_SPLITRGBROW_AVX2
@ -356,13 +354,7 @@ extern "C" {
defined(_M_X64) || defined(_M_X86)) && \
((defined(_MSC_VER) && !defined(__clang__)) || \
defined(LIBYUV_ENABLE_ROWWIN))
#define HAS_RAWTOARGBROW_AVX2
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ABGRTOYJROW_AVX2
@ -378,10 +370,6 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512) && \
!defined(LIBYUV_ENABLE_ROWWIN)
#define HAS_COPYROW_AVX512BW
#if defined(__x86_64__) || defined(_M_X64)
#define HAS_RAWTOARGBROW_AVX512BW
#define HAS_RGB24TOARGBROW_AVX512BW
#endif
#define HAS_ARGBTORGB24ROW_AVX512VBMI
#define HAS_CONVERT16TO8ROW_AVX512BW
#define HAS_MERGEUVROW_AVX512BW
@ -395,7 +383,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_AVX512BW
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
#define HAS_ARGBTOYROW_AVX512BW
#define HAS_ARGBTOYMATRIXROW_AVX512BW
#define HAS_ARGBTOUVJ444ROW_AVX512BW
#define HAS_ARGBTOUVROW_AVX512BW
#define HAS_ARGBTOUVJROW_AVX512BW
@ -433,7 +420,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON
#define HAS_ARGBTOUVJ444ROW_NEON
#define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVMATRIXROW_NEON
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#if !defined(__aarch64__)
@ -496,9 +482,13 @@ extern "C" {
#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVJROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVJROW_NEON
#define HAS_RGB24TOUVROW_NEON
#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
#define HAS_RGB565TOUVROW_NEON
#define HAS_RGB565TOYROW_NEON
@ -569,7 +559,6 @@ extern "C" {
#define HAS_ARGBSEPIAROW_NEON_DOTPROD
#define HAS_ARGBTOYJROW_NEON_DOTPROD
#define HAS_ARGBTOYROW_NEON_DOTPROD
#define HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
#define HAS_BGRATOYROW_NEON_DOTPROD
#define HAS_RGBATOYJROW_NEON_DOTPROD
#define HAS_RGBATOYROW_NEON_DOTPROD
@ -580,7 +569,6 @@ extern "C" {
#define HAS_ARGBTOUV444ROW_NEON_I8MM
#define HAS_ARGBTOUVJ444ROW_NEON_I8MM
#define HAS_ARGBTOUVJROW_NEON_I8MM
#define HAS_ARGBTOUVMATRIXROW_NEON_I8MM
#define HAS_ARGBTOUVROW_NEON_I8MM
#define HAS_BGRATOUVROW_NEON_I8MM
#define HAS_RGBATOUVROW_NEON_I8MM
@ -596,7 +584,6 @@ extern "C" {
#define HAS_ARGBTORGB565DITHERROW_SVE2
#define HAS_ARGBTORGB565ROW_SVE2
#define HAS_ARGBTOUVJROW_SVE2
#define HAS_ARGBTOUVMATRIXROW_SVE2
#define HAS_ARGBTOUVROW_SVE2
#define HAS_AYUVTOUVROW_SVE2
#define HAS_AYUVTOVUROW_SVE2
@ -648,7 +635,6 @@ extern "C" {
#define HAS_ABGRTOUVROW_SME
#define HAS_ARGBMULTIPLYROW_SME
#define HAS_ARGBTOUVJROW_SME
#define HAS_ARGBTOUVMATRIXROW_SME
#define HAS_ARGBTOUVROW_SME
#define HAS_BGRATOUVROW_SME
#define HAS_CONVERT16TO8ROW_SME
@ -757,8 +743,10 @@ extern "C" {
#define HAS_RAWTOARGBROW_LSX
#define HAS_RAWTORGB24ROW_LSX
#define HAS_RAWTOUVROW_LSX
#define HAS_RAWTOYROW_LSX
#define HAS_RGB24TOARGBROW_LSX
#define HAS_RGB24TOUVROW_LSX
#define HAS_RGB24TOYROW_LSX
#define HAS_RGB565TOARGBROW_LSX
#define HAS_RGB565TOUVROW_LSX
#define HAS_RGB565TOYROW_LSX
@ -778,9 +766,10 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_LSX
#define HAS_YUY2TOYROW_LSX
#define HAS_ARGBTOYROW_LSX
#define HAS_ARGBTOYMATRIXROW_LSX
#define HAS_ABGRTOYJROW_LSX
#define HAS_RGBATOYJROW_LSX
#define HAS_RGB24TOYJROW_LSX
#define HAS_RAWTOYJROW_LSX
#endif
#if !defined(LIBYUV_DISABLE_LSX) && defined(__loongarch_sx)
@ -813,7 +802,6 @@ extern "C" {
#define HAS_ARGBTOUVROW_LASX
#define HAS_ARGBTOYJROW_LASX
#define HAS_ARGBTOYROW_LASX
#define HAS_ARGBTOYMATRIXROW_LASX
#define HAS_ABGRTOYJROW_LASX
#define HAS_ABGRTOYROW_LASX
#define HAS_I422ALPHATOARGBROW_LASX
@ -832,8 +820,10 @@ extern "C" {
#define HAS_NV21TOARGBROW_LASX
#define HAS_RAWTOARGBROW_LASX
#define HAS_RAWTOUVROW_LASX
#define HAS_RAWTOYROW_LASX
#define HAS_RGB24TOARGBROW_LASX
#define HAS_RGB24TOUVROW_LASX
#define HAS_RGB24TOYROW_LASX
#define HAS_RGB565TOARGBROW_LASX
#define HAS_RGB565TOUVROW_LASX
#define HAS_RGB565TOYROW_LASX
@ -846,6 +836,8 @@ extern "C" {
#define HAS_RGBATOYROW_LASX
#define HAS_RGBATOYJROW_LASX
#define HAS_BGRATOYROW_LASX
#define HAS_RGB24TOYJROW_LASX
#define HAS_RAWTOYJROW_LASX
#endif
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector)
@ -875,6 +867,10 @@ extern "C" {
#define HAS_BGRATOYROW_RVV
#define HAS_COPYROW_RVV
#define HAS_INTERPOLATEROW_RVV
#define HAS_RAWTOYJROW_RVV
#define HAS_RAWTOYROW_RVV
#define HAS_RGB24TOYJROW_RVV
#define HAS_RGB24TOYROW_RVV
#define HAS_RGBATOYJROW_RVV
#define HAS_RGBATOYMATRIXROW_RVV
#define HAS_RGBATOYROW_RVV
@ -896,7 +892,8 @@ extern "C" {
// __riscv_vcreate_v_u8m2x3
// __riscv_vcreate_v_u8m2x4
// __riscv_vcreate_v_u8m4x2
#if defined(LIBYUV_RVV_HAS_VCREATE)
#if !defined(LIBYUV_RVV_HAS_TUPLE_TYPE) || \
(defined(LIBYUV_RVV_HAS_TUPLE_TYPE) && defined(LIBYUV_RVV_HAS_VCREATE))
#define HAS_AB64TOARGBROW_RVV
#define HAS_AR64TOAB64ROW_RVV
#define HAS_ARGBATTENUATEROW_RVV
@ -1779,6 +1776,12 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYRow_AVX512BW(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
@ -1844,43 +1847,6 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_Any_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVMatrixRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c);
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2131,6 +2097,10 @@ void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_y,
int width);
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
@ -2141,19 +2111,31 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void ABGRToYRow_LASX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width);
void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
@ -2215,42 +2197,6 @@ void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb,
int width,
const struct ArgbConstants* c);
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c);
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -2305,6 +2251,10 @@ void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
@ -2324,6 +2274,14 @@ void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@ -2352,6 +2310,10 @@ void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON_DotProd(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@ -2365,21 +2327,29 @@ void ABGRToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB565ToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_LSX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RGB565ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToYJRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGB1555ToYRow_Any_LASX(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@ -4029,7 +3999,6 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
int width);
void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width);
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
@ -4121,9 +4090,6 @@ void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);

View File

@ -2019,7 +2019,7 @@ static const int8_t kABGRToUVJCoefficients[] = {
43, 85, -128, 0, -128, 107, 21, 0,
};
#define ARGBTOUVMATRIX_SVE \
#define ABCDTOUVMATRIX_SVE \
"ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \
"ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \
"ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \
@ -2113,7 +2113,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
"ptrue p4.d \n"
"ptrue p5.h \n"
"1: \n" //
ARGBTOUVMATRIX_SVE
ABCDTOUVMATRIX_SVE
"b.gt 1b \n"
"2: \n"
@ -2126,7 +2126,7 @@ static inline void ARGBToUVMatrixRow_SVE_SC(const uint8_t* src_argb,
"whilelt p3.d, %w[vl2], %w[width] \n"
"whilelt p4.d, %w[vl3], %w[width] \n"
"whilelt p5.h, wzr, %w[width] \n" //
ARGBTOUVMATRIX_SVE
ABCDTOUVMATRIX_SVE
"b.gt 3b \n"
"99: \n"

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1934
#define LIBYUV_VERSION 1928
#endif // INCLUDE_LIBYUV_VERSION_H_

BIN
psnr.o Normal file

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -3638,22 +3638,6 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RGB24ToARGBRow = RGB24ToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
@ -3688,7 +3672,8 @@ int RGB24ToARGB(const uint8_t* src_rgb24,
RGB24ToARGBRow = RGB24ToARGBRow_RVV;
}
#endif
for (y = 0; y < height; ++y) {
for (y = 0; y < height; ++y) {
RGB24ToARGBRow(src_rgb24, dst_argb, width);
src_rgb24 += src_stride_rgb24;
dst_argb += dst_stride_argb;
@ -3738,14 +3723,6 @@ int RAWToARGB(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;

View File

@ -199,70 +199,7 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u,
uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUV444MatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
ARGBToUV444MatrixRow_C;
#if defined(HAS_ARGBTOUV444MATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_SSSE3;
@ -287,6 +224,14 @@ ARGBToUV444MatrixRow_C;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUV444MATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON;
@ -510,96 +455,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -615,6 +471,14 @@ ARGBToUVMatrixRow_C;
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
height == 0) {
@ -795,7 +659,7 @@ int ARGBToNV12(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -877,96 +741,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
ARGBToUVMatrixRow_C;
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
@ -982,6 +757,14 @@ ARGBToUVMatrixRow_C;
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
@ -1006,7 +789,7 @@ ARGBToUVMatrixRow_C;
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1240,7 +1023,7 @@ int ARGBToNV21(const uint8_t* src_argb,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1460,7 +1243,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -1673,7 +1456,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -4117,93 +3900,41 @@ int ARGBToAB64(const uint8_t* src_argb,
return 0;
}
// Convert RAW to NV21 with Matrix.
// Enabled if 1 pass is available
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
#define HAS_RAWTOYJROW
#endif
// RAW to JNV21 full range NV21
LIBYUV_API
int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
const struct ArgbConstants* argbconstants,
int width,
int height) {
int RAWToJNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
#if defined(HAS_RAWTOYJROW)
void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
uint8_t* dst_uj, uint8_t* dst_vj, int width) =
RAWToUVJRow_C;
void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
RAWToYJRow_C;
#else
void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RAWToARGBRow_C;
void (*ARGBToUVMatrixRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width,
const struct ArgbConstants* c) =
ARGBToUVMatrixRow_C;
void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_uj, uint8_t* dst_vj, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYJRow_C;
#endif
void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj,
uint8_t* dst_vu, int width) = MergeUVRow_C;
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
}
#endif
if (!src_raw || !dst_y || !dst_vu || !argbconstants || width <= 0 || height == 0) {
if (!src_raw || !dst_y || !dst_vu || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
@ -4213,6 +3944,44 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
#if defined(HAS_RAWTOYJROW)
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToUVJRow = RAWToUVJRow_Any_NEON;
RAWToYJRow = RAWToYJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
RAWToYJRow = RAWToYJRow_NEON;
RAWToUVJRow = RAWToUVJRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOYJROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RAWToYJRow = RAWToYJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RAWToYJRow = RAWToYJRow_LSX;
}
}
#endif
#if defined(HAS_RAWTOYJROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RAWToYJRow = RAWToYJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RAWToYJRow = RAWToYJRow_LASX;
}
}
#endif
#if defined(HAS_RAWTOYJROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RAWToYJRow = RAWToYJRow_RVV;
}
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else // HAS_RAWTOYJROW
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@ -4229,99 +3998,47 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
RAWToARGBRow = RAWToARGBRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
RAWToARGBRow = RAWToARGBRow_AVX512BW;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
RAWToARGBRow = RAWToARGBRow_SVE2;
}
#endif
#if defined(HAS_RAWTOARGBROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
RAWToARGBRow = RAWToARGBRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
RAWToARGBRow = RAWToARGBRow_LSX;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
RAWToARGBRow = RAWToARGBRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
RAWToARGBRow = RAWToARGBRow_LASX;
}
}
#endif
#if defined(HAS_RAWTOARGBROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
RAWToARGBRow = RAWToARGBRow_RVV;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM)
if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SVE2)
if (TestCpuFlag(kCpuHasSVE2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME;
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW;
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToUVJRow = ARGBToUVJRow_AVX512BW;
}
}
#endif
#endif // HAS_RAWTOYJROW
#if defined(HAS_MERGEUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
MergeUVRow = MergeUVRow_Any_SSE2;
@ -4333,7 +4050,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) {
if (IS_ALIGNED(halfwidth, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}
@ -4372,86 +4089,58 @@ int RAWToNV21Matrix(const uint8_t* src_raw,
MergeUVRow = MergeUVRow_RVV;
}
#endif
{
// Allocate 2 rows of ARGB.
const int row_size = (width * 4 + 31) & ~31;
align_buffer_64(row, row_size * 2);
// Allocate 1 row of U and 1 row of V.
align_buffer_64(row_u, halfwidth);
align_buffer_64(row_v, halfwidth);
if (!row || !row_u || !row_v) {
free_aligned_buffer_64(row);
free_aligned_buffer_64(row_u);
free_aligned_buffer_64(row_v);
#if defined(HAS_RAWTOYJROW)
// Allocate a row of uv.
const int row_uv_size = ((halfwidth + 31) & ~31);
align_buffer_64(row_uj, row_uv_size * 2);
uint8_t* row_vj = row_uj + row_uv_size;
#else
// Allocate row of uv and 2 rows of ARGB.
const int row_size = ((width * 4 + 31) & ~31);
const int row_uv_size = ((halfwidth + 31) & ~31);
align_buffer_64(row_uj, row_uv_size * 2 + row_size * 2);
uint8_t* row_vj = row_uj + row_uv_size;
uint8_t* row = row_vj + row_uv_size;
#endif
if (!row_uj)
return 1;
}
for (y = 0; y < height - 1; y += 2) {
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, src_stride_raw, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
RAWToYJRow(src_raw, dst_y, width);
RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
#else
RAWToARGBRow(src_raw, row, width);
RAWToARGBRow(src_raw + src_stride_raw, row + row_size, width);
ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants);
MergeUVRow(row_v, row_u, dst_vu, halfwidth);
ARGBToYMatrixRow(row, dst_y, width, argbconstants);
ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants);
ARGBToUVJRow(row, row_size, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
ARGBToYJRow(row, dst_y, width);
ARGBToYJRow(row + row_size, dst_y + dst_stride_y, width);
#endif
src_raw += src_stride_raw * 2;
dst_y += dst_stride_y * 2;
dst_vu += dst_stride_vu;
}
if (height & 1) {
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, 0, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
RAWToYJRow(src_raw, dst_y, width);
#else
RAWToARGBRow(src_raw, row, width);
ARGBToUVMatrixRow(row, 0, row_u, row_v, width, argbconstants);
MergeUVRow(row_v, row_u, dst_vu, halfwidth);
ARGBToYMatrixRow(row, dst_y, width, argbconstants);
ARGBToUVJRow(row, 0, row_uj, row_vj, width);
MergeUVRow(row_vj, row_uj, dst_vu, halfwidth);
ARGBToYJRow(row, dst_y, width);
#endif
}
free_aligned_buffer_64(row_v);
free_aligned_buffer_64(row_u);
free_aligned_buffer_64(row);
free_aligned_buffer_64(row_uj);
}
return 0;
}
LIBYUV_API
int RAWToJNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu,
dst_stride_vu, &kArgbJPEGConstants, width, height);
}
LIBYUV_API
int RAWToNV21(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_vu,
int dst_stride_vu,
int width,
int height) {
return RAWToNV21Matrix(src_raw, src_stride_raw, dst_y, dst_stride_y, dst_vu,
dst_stride_vu, &kArgbI601Constants, width, height);
}
LIBYUV_API
int RGB24ToNV12(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
return RAWToNV21Matrix(src_rgb24, src_stride_rgb24, dst_y, dst_stride_y,
dst_uv, dst_stride_uv, &kAbgrI601Constants, width,
height);
}
#undef HAS_RAWTOYJROW
#ifdef __cplusplus
} // extern "C"

View File

@ -693,7 +693,7 @@ void MergeUVPlane(const uint8_t* src_u,
#if defined(HAS_MERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_AVX2;
}
}

View File

@ -101,11 +101,11 @@ void TransposeWx8_SSSE3(const uint8_t* src,
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -243,11 +243,11 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
"xmm15");
@ -356,13 +356,13 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride)), // %4
"r"((ptrdiff_t)(dst_stride_a)), // %5
"r"((ptrdiff_t)(dst_stride_b)) // %6
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride)), // %4
"r"((intptr_t)(dst_stride_a)), // %5
"r"((intptr_t)(dst_stride_b)) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7", "xmm8", "xmm9");
}

View File

@ -616,7 +616,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX512BW
ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
@ -1000,12 +1000,6 @@ ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7)
#if defined(HAS_RAWTOARGBROW_AVX2)
ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31)
#endif
#if defined(HAS_RAWTOARGBROW_AVX512BW)
ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RGB24TOARGBROW_AVX512BW)
ANY11(RGB24ToARGBRow_Any_AVX512BW, RGB24ToARGBRow_AVX512BW, 0, 3, 4, 63)
#endif
#if defined(HAS_RAWTORGBAROW_SSSE3)
ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15)
#endif
@ -1206,36 +1200,52 @@ ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31)
#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_AVX2
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB24TOYJROW_SSSE3
ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_NEON
ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_LSX
ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_LSX
ANY11(RGB24ToYJRow_Any_LSX, RGB24ToYJRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_LASX
ANY11(RGB24ToYJRow_Any_LASX, RGB24ToYJRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB24TOYROW_LASX
ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_AVX2
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYJROW_SSSE3
ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_NEON
ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_LSX
ANY11(RAWToYRow_Any_LSX, RAWToYRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_LASX
ANY11(RAWToYRow_Any_LASX, RAWToYRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYJROW_LSX
ANY11(RAWToYJRow_Any_LSX, RAWToYJRow_LSX, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_LASX
ANY11(RAWToYJRow_Any_LASX, RAWToYJRow_LASX, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 15)
@ -2264,12 +2274,6 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \
}
#ifdef HAS_ARGBTOUVMATRIXROW_NEON
ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM
ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15)
#endif
@ -2320,18 +2324,6 @@ ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
#ifdef HAS_ARGBTOYMATRIXROW_NEON
ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD
ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_LSX
ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_LASX
ANY11MC(ARGBToYMatrixRow_Any_LASX, ARGBToYMatrixRow_LASX, 4, 31)
#endif
#ifdef HAS_ARGBTOYMATRIXROW_RVV
ANY11MC(ARGBToYMatrixRow_Any_RVV, ARGBToYMatrixRow_RVV, 4, 15)
#endif
#undef ANY11MC
#ifdef HAS_ARGBTOUVROW_AVX2

View File

@ -678,6 +678,8 @@ MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
MAKEROWY(ABGR, 0, 1, 2, 4)
MAKEROWY(RGBA, 3, 2, 1, 4)
MAKEROWY(RGB24, 2, 1, 0, 3)
MAKEROWY(RAW, 0, 1, 2, 3)
#undef MAKEROWY
// JPeg uses BT.601-1 full range
@ -751,6 +753,8 @@ static __inline uint8_t RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(ABGR, 0, 1, 2, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
MAKEROWYJ(RGB24, 2, 1, 0, 3)
MAKEROWYJ(RAW, 0, 1, 2, 3)
#undef MAKEROWYJ
static __inline uint8_t RGBToYMatrix(uint8_t r,
@ -4375,21 +4379,69 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
#ifdef HAS_RGB24TOYJROW_AVX2
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
ARGBToYJRow_AVX2(row, dst_yj, twidth);
src_rgb24 += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RGB24TOYJROW_AVX2
#ifdef HAS_RAWTOYJROW_AVX2
// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
#ifdef HAS_RAWTOARGBROW_AVX2
RAWToARGBRow_AVX2(src_raw, row, twidth);
#else
RAWToARGBRow_SSSE3(src_raw, row, twidth);
#endif
ARGBToYJRow_AVX2(row, dst_yj, twidth);
src_raw += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RAWTOYJROW_AVX2
#ifdef HAS_RGB24TOYJROW_SSSE3
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
ARGBToYJRow_SSSE3(row, dst_yj, twidth);
src_rgb24 += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RGB24TOYJROW_SSSE3
#ifdef HAS_RAWTOYJROW_SSSE3
// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
// Row buffer for intermediate ARGB pixels.
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
RAWToARGBRow_SSSE3(src_raw, row, twidth);
ARGBToYJRow_SSSE3(row, dst_yj, twidth);
src_raw += twidth * 3;
dst_yj += twidth;
width -= twidth;
}
}
#endif // HAS_RAWTOYJROW_SSSE3

View File

@ -262,64 +262,6 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#ifdef HAS_RAWTOARGBROW_AVX512BW
static const uint32_t kPermdRAWToARGB_AVX512BW[16] = {
0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) {
asm volatile(
"vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff
"vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000
"movabs $0xffffffffffff,%%rax \n" // 48 bytes mask
"kmovq %%rax,%%k1 \n"
"vmovdqu32 %3,%%zmm5 \n"
"vbroadcasti32x4 %4,%%zmm4 \n"
LABELALIGN //
"1: \n"
"vmovdqu8 (%0),%%zmm0%{%%k1%}%{z%} \n"
"vmovdqu8 48(%0),%%zmm1%{%%k1%}%{z%} \n"
"vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n"
"vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n"
"lea 192(%0),%0 \n"
"vpermd %%zmm0,%%zmm5,%%zmm0 \n"
"vpermd %%zmm1,%%zmm5,%%zmm1 \n"
"vpermd %%zmm2,%%zmm5,%%zmm2 \n"
"vpermd %%zmm3,%%zmm5,%%zmm3 \n"
"vpshufb %%zmm4,%%zmm0,%%zmm0 \n"
"vpshufb %%zmm4,%%zmm1,%%zmm1 \n"
"vpshufb %%zmm4,%%zmm2,%%zmm2 \n"
"vpshufb %%zmm4,%%zmm3,%%zmm3 \n"
"vpord %%zmm6,%%zmm0,%%zmm0 \n"
"vpord %%zmm6,%%zmm1,%%zmm1 \n"
"vpord %%zmm6,%%zmm2,%%zmm2 \n"
"vpord %%zmm6,%%zmm3,%%zmm3 \n"
"vmovdqu32 %%zmm0,(%1) \n"
"vmovdqu32 %%zmm1,0x40(%1) \n"
"vmovdqu32 %%zmm2,0x80(%1) \n"
"vmovdqu32 %%zmm3,0xc0(%1) \n"
"lea 0x100(%1),%1 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kPermdRAWToARGB_AVX512BW), // %3
"m"(*shuffler) // %4
: "memory", "cc", "rax", "k1", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6");
}
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width);
}
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width);
}
#endif
// Same code as RAWToARGB with different shuffler and A in low bits
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile(
@ -1913,9 +1855,9 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1988,9 +1930,9 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2293,11 +2235,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb,
#else
"+rm"(width) // %3
#endif
: "r"((ptrdiff_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB), // %6
"m"(kPermdARGBToY_AVX512BW), // %7
"m"(kPermdARGBToUV_AVX512BW) // %8
: "r"((intptr_t)(src_stride_argb)), // %4
"r"(c), // %5
"m"(kShuffleAARRGGBB), // %6
"m"(kPermdARGBToY_AVX512BW), // %7
"m"(kPermdARGBToUV_AVX512BW) // %8
: "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
"zmm7", "zmm16", "zmm17", "zmm18", "zmm19");
}
@ -4649,7 +4591,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("movdqa %3,%%xmm5 \n"
LABELALIGN
@ -4670,7 +4612,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
#ifdef HAS_MIRRORROW_AVX2
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
@ -4697,7 +4639,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("movdqa %3,%%xmm5 \n"
LABELALIGN
@ -4718,7 +4660,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
@ -4747,7 +4689,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %4,%%xmm1 \n"
"lea -0x10(%0,%3,2),%0 \n"
@ -4786,7 +4728,7 @@ static const uvec8 kShuffleMirrorRGB1 = {
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
src_rgb24 += width * 3 - 48;
asm volatile(
"movdqa %3,%%xmm4 \n"
@ -4822,7 +4764,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
#ifdef HAS_ARGBMIRRORROW_SSE2
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("lea -0x10(%0,%2,4),%0 \n"
LABELALIGN
@ -4846,7 +4788,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
ptrdiff_t temp_width = (ptrdiff_t)(width);
intptr_t temp_width = (intptr_t)(width);
asm volatile("vmovdqu %3,%%ymm5 \n"
LABELALIGN
@ -6867,10 +6809,10 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(stride_yuy2)) // %3
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((intptr_t)(stride_yuy2)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -6906,11 +6848,11 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_yuy2)) // %4
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -7001,11 +6943,11 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_uyvy)) // %4
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -7092,10 +7034,10 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)(stride_yuy2)) // %3
: "+r"(src_yuy2), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"((intptr_t)(stride_yuy2)) // %3
: "memory", "cc", "xmm0", "xmm1");
}
@ -7132,11 +7074,11 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_yuy2)) // %4
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_yuy2)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
@ -7232,11 +7174,11 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(stride_uyvy)) // %4
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((intptr_t)(stride_uyvy)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
@ -8596,12 +8538,12 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
"sub $0x1,%3 \n"
"jge 10b \n"
"19: \n"
: "+r"(topleft), // %0
"+r"(botleft), // %1
"+r"(dst), // %2
"+rm"(count) // %3
: "r"((ptrdiff_t)(width)), // %4
"rm"(area) // %5
: "+r"(topleft), // %0
"+r"(botleft), // %1
"+r"(dst), // %2
"+rm"(count) // %3
: "r"((intptr_t)(width)), // %4
"rm"(area) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
@ -8614,7 +8556,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
const float* src_dudv,
int width) {
ptrdiff_t src_argb_stride_temp = src_argb_stride;
intptr_t src_argb_stride_temp = src_argb_stride;
intptr_t temp;
asm volatile(
"movq (%3),%%xmm2 \n"
@ -8766,11 +8708,11 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr,
"jg 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+rm"(width), // %2
"+r"(source_y_fraction) // %3
: "r"(src_stride) // %4
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+rm"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_SSSE3
@ -8844,11 +8786,11 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
"99: \n"
"vzeroupper \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"(src_stride) // %4
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_AVX2
@ -9678,12 +9620,12 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
"lea 0x10(%2),%2 \n"
"sub $0x10,%3 \n" // 16 src pixels per loop
"jg 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride_u)), // %4
"r"((ptrdiff_t)(src_stride_v)) // %5
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
@ -9724,12 +9666,12 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
"sub $0x20,%3 \n" // 32 src pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)(src_stride_u)), // %4
"r"((ptrdiff_t)(src_stride_v)) // %5
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

View File

@ -2013,24 +2013,24 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y,
}
}
#ifndef ArgbConstants
struct ArgbConstants {
#ifndef RgbConstants
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
uint16_t pad;
};
#define ArgbConstants ArgbConstants
#define RgbConstants RgbConstants
// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
128,
0};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -2038,20 +2038,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080,
0};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
0x1080,
0};
#endif // ArgbConstants
#endif // RgbConstants
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
@ -2088,7 +2088,7 @@ void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
: "+&r"(src_argb), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), "r"(shuff)
: "r"(rgbconstants), "r"(shuff)
: "memory");
}
@ -2113,7 +2113,7 @@ void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
@ -2150,7 +2150,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), "r"(shuff)
: "r"(rgbconstants), "r"(shuff)
: "memory");
}
@ -2169,7 +2169,7 @@ void BGRAToYRow_LASX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int8_t shuff[128] = {
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
@ -2219,14 +2219,26 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), // %3
: "r"(rgbconstants), // %3
"r"(shuff) // %4
: "memory");
}
void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LASX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_LASX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LASX(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LASX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LASX(src_raw, dst_y, width, &kRawI601Constants);
}
void ARGBToUVJRow_LASX(const uint8_t* src_argb,
int src_stride_argb,

View File

@ -2798,24 +2798,24 @@ void HalfFloatRow_LSX(const uint16_t* src,
}
}
#ifndef ArgbConstants
struct ArgbConstants {
#ifndef RgbConstants
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
uint16_t pad;
};
#define ArgbConstants ArgbConstants
#define RgbConstants RgbConstants
// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
128,
0};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -2823,20 +2823,20 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080,
0};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0},
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
0x1080,
0};
#endif // ArgbConstants
#endif // RgbConstants
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
@ -2870,7 +2870,7 @@ void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
: "+&r"(src_argb), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c)
: "r"(rgbconstants)
: "memory");
}
@ -2895,7 +2895,7 @@ void ABGRToYJRow_LSX(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
@ -2929,7 +2929,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c)
: "r"(rgbconstants)
: "memory");
}
@ -2948,7 +2948,7 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
@ -2990,14 +2990,26 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
: "+&r"(src_rgba), // %0
"+&r"(dst_y), // %1
"+&r"(width) // %2
: "r"(c), // %3
: "r"(rgbconstants), // %3
"r"(shuff) // %4
: "memory");
}
void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LSX(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_LSX(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_LSX(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LSX(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_LSX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_LSX(src_raw, dst_y, width, &kRawI601Constants);
}
// undef for unified sources build
#undef YUVTORGB_SETUP

View File

@ -1918,72 +1918,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
// clang-format on
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_argb
"vld1.8 {d18}, [%5] \n" // load kRGBToU
"vld1.8 {d19}, [%6] \n" // load kRGBToV
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
"vdup.16 q10, d16[0] \n" // U0
"vdup.16 q11, d16[1] \n" // U1
"vdup.16 q12, d16[2] \n" // U2
"vdup.16 q13, d18[0] \n" // V0
"vdup.16 q14, d18[1] \n" // V1
"vdup.16 q15, d18[2] \n" // V2
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
"vmov.u16 q3, #0x8000 \n" // 128.0
"vmul.s16 q8, q0, q10 \n" // U = B * U0
"vmla.s16 q8, q1, q11 \n" // U += G * U1
"vmla.s16 q8, q2, q12 \n" // U += R * U2
"vmul.s16 q9, q0, q13 \n" // V = B * V0
"vmla.s16 q9, q1, q14 \n" // V += G * V1
"vmla.s16 q9, q2, q15 \n" // V += R * V2
"vsub.u16 q8, q3, q8 \n" // 128.0 - U
"vsub.u16 q9, q3, q9 \n" // 128.0 - V
"vqshrn.u16 d0, q8, #8 \n" // Saturating shift right
"vqshrn.u16 d1, q9, #8 \n"
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride_argb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
: "r"(&c->kRGBToU), // %5
"r"(&c->kRGBToV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -2896,7 +2830,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
@ -2931,9 +2865,21 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
"q12");
}
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
}
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants);
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants);
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,

View File

@ -9,7 +9,6 @@
*/
#include "libyuv/row.h"
#include "libyuv/convert_from_argb.h"
#ifdef __cplusplus
namespace libyuv {
@ -2894,26 +2893,14 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"ldr q16, [%[c], #16] \n" // kRGBToU
"ldr q17, [%[c], #32] \n" // kRGBToV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
@ -2922,7 +2909,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
@ -2932,20 +2919,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
// U = B*U0 + G*U1 + R*U2
"mul v3.8h, v0.8h, v20.8h \n"
"mla v3.8h, v1.8h, v21.8h \n"
"mla v3.8h, v2.8h, v22.8h \n"
// V = B*V0 + G*V1 + R*V2
"mul v4.8h, v0.8h, v23.8h \n"
"mla v4.8h, v1.8h, v24.8h \n"
"mla v4.8h, v2.8h, v26.8h \n"
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
"subhn v0.8b, v25.8h, v3.8h \n"
"subhn v1.8b, v25.8h, v4.8h \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
@ -2954,21 +2928,12 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
: [c] "r"(c) // %5
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void ARGBToUVRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
&kArgbI601Constants);
}
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
@ -3484,7 +3449,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
}
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
static void ABCDToUVMatrixRow_NEON_I8MM(const uint8_t* src,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
@ -3581,25 +3546,12 @@ static const int8_t kRGBAToUVCoefficients[] = {
0, -112, 74, 38, 0, 18, 94, -112,
};
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
}
@ -3608,7 +3560,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
}
@ -3617,7 +3569,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
}
@ -3626,7 +3578,7 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
}
@ -3654,7 +3606,7 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients);
}
@ -3663,7 +3615,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
ABCDToUVMatrixRow_NEON_I8MM(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVJCoefficients);
}
@ -3763,20 +3715,22 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
}
struct RgbConstants {
uint8_t kRGBToY[4];
uint16_t kAddY;
};
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3795,21 +3749,20 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
void ARGBToYMatrixRow_NEON_DotProd(
static void ARGBToYMatrixRow_NEON_DotProd(
const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v16.4s, v0.s[0] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3831,7 +3784,7 @@ void ARGBToYMatrixRow_NEON_DotProd(
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
@ -3841,10 +3794,12 @@ void ARGBToYMatrixRow_NEON_DotProd(
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
0x0080};
static const struct RgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77},
0x0080};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
@ -3852,11 +3807,14 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {},
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
0x1080};
static const struct RgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66},
0x1080};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
static const struct RgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25},
0x1080};
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
@ -3903,14 +3861,13 @@ void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"ldr d0, [%3] \n" // load rgbconstants
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"dup v17.8h, v0.h[2] \n"
"1: \n"
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
// pixels.
@ -3929,7 +3886,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
@ -3973,10 +3930,10 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
&kRawI601DotProdConstants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
const struct RgbConstants* rgbconstants) {
asm volatile(
"ldr d0, [%3] \n" // load rgbconstants
"dup v5.16b, v0.b[0] \n"
@ -4000,13 +3957,25 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
: "+r"(src_rgb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "r"(rgbconstants) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
}
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
}
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_NEON(uint8_t* dst_ptr,

File diff suppressed because it is too large Load Diff

View File

@ -1120,20 +1120,6 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
: "cc", "memory", "z0", "z1", "z2", "p0", "p1");
}
__arm_locally_streaming void ARGBToUVMatrixRow_SME(
const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
__arm_locally_streaming void ARGBToUVRow_SME(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -217,19 +217,6 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y,
NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width);
}
void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants);
}
void ARGBToUVRow_SVE2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,

View File

@ -122,10 +122,8 @@ extern "C" {
#if defined(__clang__) || defined(__GNUC__)
#define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
#else
#define LIBYUV_TARGET_AVX2
#define LIBYUV_TARGET_AVX512BW
#endif
LIBYUV_TARGET_AVX2
@ -212,197 +210,6 @@ LIBYUV_TARGET_AVX2
void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_AVX2(src_bgra, dst_y, width, &kBgraI601Constants);
}
#ifdef HAS_RAWTOARGBROW_AVX2
LIBYUV_TARGET_AVX2
void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
__m256i ymm_alpha = _mm256_set1_epi32(0xff000000);
__m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
__m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6);
__m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low);
__m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high);
while (width > 0) {
__m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw);
__m256i ymm0 = _mm256_castsi128_si256(xmm0);
ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1);
__m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24));
__m256i ymm1 = _mm256_castsi128_si256(xmm1);
ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1);
__m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48));
__m256i ymm2 = _mm256_castsi128_si256(xmm2);
ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1);
__m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68));
__m256i ymm3 = _mm256_castsi128_si256(xmm3);
ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1);
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2);
ymm0 = _mm256_or_si256(ymm0, ymm_alpha);
ymm1 = _mm256_or_si256(ymm1, ymm_alpha);
ymm2 = _mm256_or_si256(ymm2, ymm_alpha);
ymm3 = _mm256_or_si256(ymm3, ymm_alpha);
_mm256_storeu_si256((__m256i*)dst_argb, ymm0);
_mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1);
_mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2);
_mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3);
src_raw += 96;
dst_argb += 128;
width -= 32;
}
}
#endif
#ifdef HAS_RAWTOARGBROW_AVX512BW
LIBYUV_TARGET_AVX512BW
void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) {
__m512i zmm_alpha = _mm512_set1_epi32(0xff000000);
__m512i zmm_perm = _mm512_set_epi32(
12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0);
__m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler));
while (width > 0) {
__m512i zmm0 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw);
__m512i zmm1 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 48);
__m512i zmm2 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 96);
__m512i zmm3 = _mm512_maskz_loadu_epi8(0xffffffffffffull, src_raw + 144);
zmm0 = _mm512_permutexvar_epi32(zmm_perm, zmm0);
zmm1 = _mm512_permutexvar_epi32(zmm_perm, zmm1);
zmm2 = _mm512_permutexvar_epi32(zmm_perm, zmm2);
zmm3 = _mm512_permutexvar_epi32(zmm_perm, zmm3);
zmm0 = _mm512_shuffle_epi8(zmm0, zmm_shuf);
zmm1 = _mm512_shuffle_epi8(zmm1, zmm_shuf);
zmm2 = _mm512_shuffle_epi8(zmm2, zmm_shuf);
zmm3 = _mm512_shuffle_epi8(zmm3, zmm_shuf);
zmm0 = _mm512_or_si512(zmm0, zmm_alpha);
zmm1 = _mm512_or_si512(zmm1, zmm_alpha);
zmm2 = _mm512_or_si512(zmm2, zmm_alpha);
zmm3 = _mm512_or_si512(zmm3, zmm_alpha);
_mm512_storeu_si512(dst_argb, zmm0);
_mm512_storeu_si512(dst_argb + 64, zmm1);
_mm512_storeu_si512(dst_argb + 128, zmm2);
_mm512_storeu_si512(dst_argb + 192, zmm3);
src_raw += 192;
dst_argb += 256;
width -= 64;
}
}
LIBYUV_TARGET_AVX512BW
void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
__m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2);
RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width);
}
LIBYUV_TARGET_AVX512BW
void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
__m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0);
RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width);
}
#endif
#ifdef HAS_ARGBTOUVMATRIXROW_AVX2
LIBYUV_TARGET_AVX2 __attribute__((no_sanitize("cfi-icall")))
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
__m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
__m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
__m256i ymm_0101 = _mm256_set1_epi16(0x0101);
__m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15);
__m256i ymm_8000 = _mm256_set1_epi16((short)0x8000);
__m256i ymm_zero = _mm256_setzero_si256();
while (width > 0) {
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb));
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32));
ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf);
ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf);
ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf);
ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf);
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_0101);
ymm1 = _mm256_maddubs_epi16(ymm1, ymm_0101);
ymm2 = _mm256_maddubs_epi16(ymm2, ymm_0101);
ymm3 = _mm256_maddubs_epi16(ymm3, ymm_0101);
ymm0 = _mm256_add_epi16(ymm0, ymm2);
ymm1 = _mm256_add_epi16(ymm1, ymm3);
ymm0 = _mm256_srli_epi16(ymm0, 1);
ymm1 = _mm256_srli_epi16(ymm1, 1);
ymm0 = _mm256_avg_epu16(ymm0, ymm_zero);
ymm1 = _mm256_avg_epu16(ymm1, ymm_zero);
ymm0 = _mm256_packus_epi16(ymm0, ymm1);
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
ymm1 = _mm256_maddubs_epi16(ymm0, ymm_v);
ymm0 = _mm256_maddubs_epi16(ymm0, ymm_u);
ymm0 = _mm256_hadd_epi16(ymm0, ymm1);
ymm0 = _mm256_permute4x64_epi64(ymm0, 0xd8);
ymm0 = _mm256_sub_epi16(ymm_8000, ymm0);
ymm0 = _mm256_srli_epi16(ymm0, 8);
ymm0 = _mm256_packus_epi16(ymm0, ymm0);
__m128i xmm_u = _mm256_castsi256_si128(ymm0);
__m128i xmm_v = _mm256_extracti128_si256(ymm0, 1);
_mm_storel_epi64((__m128i*)dst_u, xmm_u);
_mm_storel_epi64((__m128i*)dst_v, xmm_v);
src_argb += 64;
dst_u += 8;
dst_v += 8;
width -= 16;
}
}
#endif
#ifdef HAS_MERGEUVROW_AVX2
LIBYUV_TARGET_AVX2
void MergeUVRow_AVX2(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
while (width > 0) {
__m256i ymm0 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
__m256i ymm1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
ymm1 = _mm256_slli_epi16(ymm1, 8);
ymm0 = _mm256_or_si256(ymm0, ymm1);
_mm256_storeu_si256((__m256i*)dst_uv, ymm0);
src_u += 16;
src_v += 16;
dst_uv += 32;
width -= 16;
}
}
#endif
#endif

View File

@ -362,35 +362,36 @@ void ScaleRowDown4Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >>
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
@ -399,35 +400,36 @@ void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width) {
intptr_t stride = src_stride;
int x;
for (x = 0; x < dst_width - 1; x += 2) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5] +
src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5] +
src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7] +
src_ptr[src_stride * 3 + 4] + src_ptr[src_stride * 3 + 5] +
src_ptr[src_stride * 3 + 6] + src_ptr[src_stride * 3 + 7] + 8) >>
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
src_ptr[stride * 3 + 7] + 8) >>
4;
dst += 2;
src_ptr += 8;
}
if (dst_width & 1) {
dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 8) >>
src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
src_ptr[stride * 3 + 3] + 8) >>
4;
}
}
@ -890,26 +892,27 @@ void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536 / 6) >>
16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536 / 9) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536 / 9) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536 / 6) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -919,26 +922,27 @@ void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
(65536u / 9u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
(65536u / 9u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7] + src_ptr[src_stride * 2 + 6] +
src_ptr[src_stride * 2 + 7]) *
(65536u / 6u) >>
16;
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
(65536u / 9u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
(65536u / 9u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
(65536u / 6u) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -949,23 +953,22 @@ void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536 / 4) >>
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536 / 6) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536 / 6) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536 / 4) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -975,23 +978,22 @@ void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
int dst_width) {
intptr_t stride = src_stride;
int i;
assert((dst_width % 3 == 0) && (dst_width > 0));
for (i = 0; i < dst_width; i += 3) {
dst_ptr[0] =
(src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[src_stride + 0] +
src_ptr[src_stride + 1] + src_ptr[src_stride + 2]) *
(65536u / 6u) >>
16;
dst_ptr[1] =
(src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[src_stride + 3] +
src_ptr[src_stride + 4] + src_ptr[src_stride + 5]) *
(65536u / 6u) >>
16;
dst_ptr[2] = (src_ptr[6] + src_ptr[7] + src_ptr[src_stride + 6] +
src_ptr[src_stride + 7]) *
(65536u / 4u) >>
dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
src_ptr[stride + 1] + src_ptr[stride + 2]) *
(65536u / 6u) >>
16;
dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
src_ptr[stride + 4] + src_ptr[stride + 5]) *
(65536u / 6u) >>
16;
dst_ptr[2] =
(src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
(65536u / 4u) >>
16;
src_ptr += 8;
dst_ptr += 3;
}
@ -1687,7 +1689,7 @@ void ScalePlaneVertical(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride,
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_bytes, yf);
dst_argb += dst_stride;
y += dy;
@ -1763,7 +1765,7 @@ void ScalePlaneVertical_16(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow(dst_argb, src_argb + yi * (ptrdiff_t)src_stride, src_stride,
InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
@ -1832,8 +1834,8 @@ void ScalePlaneVertical_16To8(int src_height,
}
yi = y >> 16;
yf = filtering ? ((y >> 8) & 255) : 0;
InterpolateRow_16To8(dst_argb, src_argb + yi * (ptrdiff_t)src_stride,
src_stride, scale, dst_width_words, yf);
InterpolateRow_16To8(dst_argb, src_argb + yi * src_stride, src_stride,
scale, dst_width_words, yf);
dst_argb += dst_stride;
y += dy;
}

View File

@ -183,10 +183,10 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
@ -283,10 +283,10 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEROWDOWN2_AVX2
@ -326,7 +326,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
ptrdiff_t stridex3;
intptr_t stridex3;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"pabsw %%xmm4,%%xmm5 \n"
@ -367,11 +367,11 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
"lea 0x8(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"=&r"(stridex3) // %3
: "r"(src_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"=&r"(stridex3) // %3
: "r"((intptr_t)(src_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
@ -456,11 +456,11 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(src_stride * 3) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(src_stride * 3)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEROWDOWN4_AVX2
@ -557,11 +557,11 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kMadd21) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -625,11 +625,11 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kMadd21) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -701,10 +701,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
@ -762,10 +762,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -935,11 +935,11 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1084,12 +1084,12 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearShuffleFar) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearShuffleFar) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1246,11 +1246,11 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
"lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1371,12 +1371,12 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1497,12 +1497,12 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -1612,12 +1612,12 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearShuffleFar) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearShuffleFar) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
@ -1746,11 +1746,11 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
@ -2016,10 +2016,10 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -2030,8 +2030,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx);
ptrdiff_t src_stepx_x12;
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
"lea 0x00(,%1,4),%1 \n"
@ -2067,8 +2067,9 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
ptrdiff_t src_stepx_x4 = (ptrdiff_t)(src_stepx);
ptrdiff_t src_stepx_x12;
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
@ -2101,7 +2102,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
"+r"(dst_argb), // %2
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(src_stride) // %5
"+r"(row1) // %5
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
@ -2363,12 +2364,12 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
"lea 0x8(%1),%1 \n" // 4 UV
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_SSSE3
@ -2404,12 +2405,12 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
@ -2530,12 +2531,12 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kUVLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2654,12 +2655,12 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kUVLinearMadd31) // %5
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2798,11 +2799,11 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
"lea 0x10(%1),%1 \n" // 2 uv to 4 uv
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
@ -2929,11 +2930,11 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif

View File

@ -2827,8 +2827,9 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
int has_large_malloc = 1;
#endif
if (!has_large_malloc) {
GTEST_SKIP() << "WARNING: Large allocation may assert for "
<< (size_t)kWidth * kHeight << " bytes";
printf("WARNING: Skipped. Large allocation may assert for %zd\n",
(size_t)kWidth * kHeight);
return;
}
// Allocate one extra column so that the coalesce optimizations do not trigger
@ -2840,16 +2841,20 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
fflush(stdout);
align_buffer_page_end(orig_i400, (size_t)kWidth * kHeight);
if (!orig_i400) {
GTEST_SKIP() << "WARNING: unable to allocate I400 image of "
<< (size_t)kWidth * kHeight << " bytes";
printf("WARNING: unable to allocate I400 image of %zd bytes\n",
(size_t)kWidth * kHeight);
fflush(stdout);
return;
}
printf("INFO: allocate I400 image returned %p\n", orig_i400);
fflush(stdout);
align_buffer_page_end(dest_argb, (size_t)kWidth * kHeight * 4);
if (!dest_argb) {
printf("WARNING: unable to allocate ARGB image of %zd bytes\n",
(size_t)kWidth * kHeight * 4);
fflush(stdout);
free_aligned_buffer_page_end(orig_i400);
GTEST_SKIP() << "WARNING: unable to allocate ARGB image of "
<< (size_t)kWidth * kHeight * 4 << " bytes";
return;
}
printf("INFO: allocate ARGB image returned %p\n", dest_argb);
fflush(stdout);
@ -2867,72 +2872,4 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) {
#endif // !defined(LEAN_TESTS)
#define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \
const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \
const int kStrideA = \
(kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \
const int kStrideY = kWidth; \
const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \
const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
align_buffer_page_end(src_argb, \
kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \
align_buffer_page_end(dst_y_c, kStrideY* kHeight); \
align_buffer_page_end(dst_uv_c, kSizeUV); \
align_buffer_page_end(dst_y_opt, kStrideY* kHeight); \
align_buffer_page_end(dst_uv_opt, kSizeUV); \
for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \
src_argb[i + OFF] = (fastrand() & 0xff); \
} \
memset(dst_y_c, 1, kStrideY* kHeight); \
memset(dst_uv_c, 2, kSizeUV); \
memset(dst_y_opt, 101, kStrideY* kHeight); \
memset(dst_uv_opt, 102, kSizeUV); \
MaskCpuFlags(disable_cpu_flags_); \
FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY, \
dst_uv_c, kStrideUV, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_opt, \
kStrideY, dst_uv_opt, kStrideUV, kWidth, NEG kHeight); \
} \
for (int i = 0; i < kStrideY * kHeight; ++i) { \
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
} \
for (int i = 0; i < kSizeUV; ++i) { \
EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \
} \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
}
#if defined(ENABLE_FULL_TESTS)
#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Unaligned, +, 4) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
#else
#define TESTATOBP(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y) \
TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \
SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
#endif
TESTATOBP(RAW, uint8_t, 3, 3, 1, NV21, 2, 2)
TESTATOBP(RGB24, uint8_t, 3, 3, 1, NV12, 2, 2)
TESTATOBP(RAW, uint8_t, 3, 3, 1, JNV21, 2, 2)
} // namespace libyuv

View File

@ -825,6 +825,7 @@ TESTATOBP(ARGB, 1, 4, NV12, 2, 2)
TESTATOBP(ARGB, 1, 4, NV21, 2, 2)
TESTATOBP(ABGR, 1, 4, NV12, 2, 2)
TESTATOBP(ABGR, 1, 4, NV21, 2, 2)
TESTATOBP(RAW, 1, 3, JNV21, 2, 2)
TESTATOBP(YUY2, 2, 4, NV12, 2, 2)
TESTATOBP(UYVY, 2, 4, NV12, 2, 2)
TESTATOBP(AYUV, 1, 4, NV12, 2, 2)

View File

@ -892,11 +892,6 @@ TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,

View File

@ -8,14 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <new>
#include "../unit_test/unit_test.h"
#include "libyuv/cpu_id.h"
#include "libyuv/scale.h"
@ -43,95 +38,6 @@
namespace libyuv {
#ifdef ENABLE_ROW_TESTS
#ifdef HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) {
SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]);
SIMD_ALIGNED(uint8_t dst_pixels_opt[64]);
SIMD_ALIGNED(uint8_t dst_pixels_c[64]);
memset(orig_pixels, 0, sizeof(orig_pixels));
memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt));
memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
if (!has_ssse3) {
printf("Warning SSSE3 not detected; Skipping test.\n");
} else {
// TL.
orig_pixels[0] = 255u;
orig_pixels[1] = 0u;
orig_pixels[128 + 0] = 0u;
orig_pixels[128 + 1] = 0u;
// TR.
orig_pixels[2] = 0u;
orig_pixels[3] = 100u;
orig_pixels[128 + 2] = 0u;
orig_pixels[128 + 3] = 0u;
// BL.
orig_pixels[4] = 0u;
orig_pixels[5] = 0u;
orig_pixels[128 + 4] = 50u;
orig_pixels[128 + 5] = 0u;
// BR.
orig_pixels[6] = 0u;
orig_pixels[7] = 0u;
orig_pixels[128 + 6] = 0u;
orig_pixels[128 + 7] = 20u;
// Odd.
orig_pixels[126] = 4u;
orig_pixels[127] = 255u;
orig_pixels[128 + 126] = 16u;
orig_pixels[128 + 127] = 255u;
// Test regular half size.
ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(133u, dst_pixels_c[63]);
// Test Odd width version - Last pixel is just 1 horizontal pixel.
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(10u, dst_pixels_c[63]);
// Test one pixel less, should skip the last pixel.
memset(dst_pixels_c, 0, sizeof(dst_pixels_c));
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63);
EXPECT_EQ(64u, dst_pixels_c[0]);
EXPECT_EQ(25u, dst_pixels_c[1]);
EXPECT_EQ(13u, dst_pixels_c[2]);
EXPECT_EQ(5u, dst_pixels_c[3]);
EXPECT_EQ(0u, dst_pixels_c[4]);
EXPECT_EQ(0u, dst_pixels_c[63]);
// Test regular half size SSSE3.
ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
EXPECT_EQ(64u, dst_pixels_opt[0]);
EXPECT_EQ(25u, dst_pixels_opt[1]);
EXPECT_EQ(13u, dst_pixels_opt[2]);
EXPECT_EQ(5u, dst_pixels_opt[3]);
EXPECT_EQ(0u, dst_pixels_opt[4]);
EXPECT_EQ(133u, dst_pixels_opt[63]);
// Compare C and SSSE3 match.
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64);
ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64);
for (int i = 0; i < 64; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
}
}
#endif // HAS_SCALEROWDOWN2_SSSE3
TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) {
SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]);
@ -467,71 +373,4 @@ TEST_F(LibYUVScaleTest, PlaneTest1_16_Box) {
free_aligned_buffer_page_end(dst_pixels_alloc);
free_aligned_buffer_page_end(orig_pixels_alloc);
}
// POC: int * int overflow in ScalePlaneVertical (scale_common.cc).
//
// `yi * src_stride` is evaluated as int * int. When the product exceeds
// INT_MAX it wraps negative and InterpolateRow reads from BEFORE the
// source allocation.
//
// Parameters:
// - dst_width == src_width
// -> ScalePlane dispatches to ScalePlaneVertical
// - src_height == 5, dst_height == 1
// -> single iteration with yi == 2
// - src_stride == 0x7FFFFFF8
// -> 2 * 0x7FFFFFF8 == 0xFFFFFFF0 == -16 (int)
//
// The source buffer is sized so that the *correct* 64-bit offset
// (2 * 0x7FFFFFF8 == 4294967280) plus kWidth bytes is in-bounds. With the
// bug, the 32-bit product is -16 and ASAN reports a heap-buffer-overflow
// READ "16 bytes before" the allocation.
TEST_F(LibYUVScaleTest, ScalePlaneVertical_IntStrideOverflow) {
const int kWidth = 16;
const int kSrcHeight = 5;
const int kDstHeight = 1;
const int kStride = 0x7FFFFFF8; // 2147483640
// src_size is big enough for the only row this call legitimately touches
// (yi == 2) when computed in 64-bit: 2 * stride + width = 4 GiB.
size_t src_size = kStride;
if (src_size > SIZE_MAX / 2) {
GTEST_SKIP() << "could not represent allocation size in size_t";
}
src_size *= 2;
if (src_size > SIZE_MAX - kWidth) {
GTEST_SKIP() << "could not represent allocation size in size_t";
}
src_size += kWidth;
#if defined(__aarch64__)
// Infer malloc can accept a large size for cpu with dot product (a76/a55)
int has_large_malloc = TestCpuFlag(kCpuHasNeonDotProd);
#else
int has_large_malloc = 1;
#endif
if (!has_large_malloc) {
GTEST_SKIP() << "large allocation may assert for " << src_size << " bytes";
}
uint8_t* src = new (std::nothrow) uint8_t[src_size];
if (!src) {
GTEST_SKIP() << "could not allocate " << src_size << " bytes";
}
uint8_t* dst = new uint8_t[kWidth];
memset(dst, 0, kWidth);
// Force the scalar path so the crash site is deterministic
// (InterpolateRow_C -> memcpy when yf == 0).
MaskCpuFlags(disable_cpu_flags_);
int r = ScalePlane(src, kStride, kWidth, kSrcHeight, dst, kWidth, kWidth,
kDstHeight, kFilterNone);
// Not reached under ASAN.
EXPECT_EQ(0, r);
delete[] src;
delete[] dst;
}
} // namespace libyuv