clang-tidy applied

Bug: libyuv:886, libyuv:889
Change-Id: I2d14d03c19402381256d3c6d988e0b7307bdffd8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2800147
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Authored and committed by Frank Barchard on 2021-04-01 14:20:35 -07:00
parent 34bf48e160
commit 60db98b6fa
15 changed files with 958 additions and 1006 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1784
Version: 1785
License: BSD
License File: LICENSE


@ -767,7 +767,7 @@ struct YuvConstants {
#else
// This struct is for Intel color conversion.
struct YuvConstants {
#if LIBYUV_UNLIMITED_DATA
#if defined(LIBYUV_UNLIMITED_DATA)
uint8_t kUVToB[32];
uint8_t kUVToG[32];
uint8_t kUVToR[32];
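The switch above from #if LIBYUV_UNLIMITED_DATA to #if defined(LIBYUV_UNLIMITED_DATA) is not purely cosmetic: defined() tests only whether the macro exists, while the bare form evaluates its value. A minimal illustration:

#define LIBYUV_UNLIMITED_DATA          // defined, but with no value
#if defined(LIBYUV_UNLIMITED_DATA)     // true: the macro exists
#endif
// #if LIBYUV_UNLIMITED_DATA           // would not compile: empty expression
// With -DLIBYUV_UNLIMITED_DATA (implicitly =1) both forms agree, but a
// value of 0 makes them disagree: defined() remains true.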
@ -1063,11 +1063,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width);
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
@ -1262,16 +1262,16 @@ void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width);
void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
@ -1373,42 +1373,42 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -1417,7 +1417,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -1440,47 +1440,47 @@ void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_v,
int width);
void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -1689,7 +1689,7 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv,
void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width);
void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
@ -1705,9 +1705,13 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width);
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width);
void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width);
void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
@ -1928,23 +1932,23 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_a,
uint8_t* dst_argb,
int width);
void MergeARGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
void MergeARGBRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
const uint8_t* a_buf,
uint8_t* dst_ptr,
int width);
void MergeARGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
void MergeARGBRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
const uint8_t* a_buf,
uint8_t* dst_ptr,
int width);
void MergeARGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
const uint8_t* src_a,
uint8_t* dst_argb,
void MergeARGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
const uint8_t* a_buf,
uint8_t* dst_ptr,
int width);
void SplitARGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
@ -1970,31 +1974,31 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_NEON(const uint8_t* src_argb,
void SplitARGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSE2(const uint8_t* src_argb,
void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb,
void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_AVX2(const uint8_t* src_argb,
void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
uint8_t* dst_a,
int width);
void SplitARGBRow_Any_NEON(const uint8_t* src_argb,
void SplitARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
@ -2020,20 +2024,20 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_argb,
int width);
void MergeXRGBRow_Any_SSE2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
void MergeXRGBRow_Any_AVX2(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
void MergeXRGBRow_Any_NEON(const uint8_t* src_r,
const uint8_t* src_g,
const uint8_t* src_b,
uint8_t* dst_argb,
void MergeXRGBRow_Any_NEON(const uint8_t* y_buf,
const uint8_t* u_buf,
const uint8_t* v_buf,
uint8_t* dst_ptr,
int width);
void SplitXRGBRow_C(const uint8_t* src_argb,
uint8_t* dst_r,
@ -2055,27 +2059,27 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_NEON(const uint8_t* src_argb,
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb,
void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb,
void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb,
void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
int width);
void SplitXRGBRow_Any_NEON(const uint8_t* src_argb,
void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_r,
uint8_t* dst_g,
uint8_t* dst_b,
@ -2183,74 +2187,74 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
uint8_t* dst_argb,
int depth,
int width);
void MergeXR30Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
void MergeXR30Row_Any_AVX2(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeAR64Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
void MergeAR64Row_Any_AVX2(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
const uint16_t* a_buf,
uint16_t* dst_ptr,
int depth,
int width);
void MergeXR64Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
void MergeXR64Row_Any_AVX2(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint16_t* dst_ptr,
int depth,
int width);
void MergeARGB16To8Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
const uint16_t* a_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeXRGB16To8Row_Any_AVX2(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeXR30Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
void MergeXR30Row_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeXR30Row_10_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_ar30,
void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeAR64Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint16_t* dst_ar64,
void MergeAR64Row_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
const uint16_t* a_buf,
uint16_t* dst_ptr,
int depth,
int width);
void MergeARGB16To8Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
const uint16_t* src_a,
uint8_t* dst_argb,
void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
const uint16_t* a_buf,
uint8_t* dst_ptr,
int depth,
int width);
void MergeXR64Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint16_t* dst_ar64,
void MergeXR64Row_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint16_t* dst_ptr,
int depth,
int width);
void MergeXRGB16To8Row_Any_NEON(const uint16_t* src_r,
const uint16_t* src_g,
const uint16_t* src_b,
uint8_t* dst_argb,
void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf,
const uint16_t* g_buf,
const uint16_t* b_buf,
uint8_t* dst_ptr,
int depth,
int width);
@ -2314,16 +2318,16 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_Any_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int scale,
int width);
void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void MultiplyRow_16_Any_NEON(const uint16_t* src_y,
uint16_t* dst_y,
void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int scale,
int width);
@ -2335,16 +2339,16 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_Any_AVX2(const uint16_t* src_y,
uint16_t* dst_y,
void DivideRow_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int scale,
int width);
void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width);
void DivideRow_16_Any_NEON(const uint16_t* src_y,
uint16_t* dst_y,
void DivideRow_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int scale,
int width);
@ -3719,15 +3723,15 @@ void I400ToARGBRow_MMI(const uint8_t* src_y,
int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
const struct YuvConstants* param,
int width);
void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr,
uint8_t* dst_ptr,
@ -3739,11 +3743,11 @@ void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr,
int width);
// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3755,7 +3759,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
void ARGBBlendRow_C(const uint8_t* src_argb0,
void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3799,11 +3803,11 @@ void BlendPlaneRow_C(const uint8_t* src0,
// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8_t* src_argb0,
void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3811,7 +3815,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3819,7 +3823,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3845,11 +3849,11 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf,
int width);
// ARGB add images.
void ARGBAddRow_C(const uint8_t* src_argb0,
void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
void ARGBAddRow_SSE2(const uint8_t* src_argb0,
void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3857,7 +3861,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBAddRow_AVX2(const uint8_t* src_argb0,
void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3865,7 +3869,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBAddRow_NEON(const uint8_t* src_argb0,
void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3892,11 +3896,11 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf,
// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBSubtractRow_C(const uint8_t* src_argb0,
void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3904,7 +3908,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -3912,7 +3916,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width);
@ -4119,9 +4123,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
uint8_t* dst_ptr,
const struct YuvConstants* yuvconstants,
int width);
void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
uint8_t* dst_ptr,
int width);
void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
const uint8_t* uv_buf,
@ -4323,7 +4327,7 @@ void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
int width);
void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4333,7 +4337,7 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
int width);
void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4343,7 +4347,7 @@ void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
int width);
void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4444,7 +4448,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy,
int width);
void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4454,7 +4458,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4464,7 +4468,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
int width);
void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
@ -4501,29 +4505,29 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int stride_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_C(const uint8_t* src_ayuv,
int stride_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_uv,
void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);
void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_vu,
int width);


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1784
#define LIBYUV_VERSION 1785
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -17,36 +17,6 @@ namespace libyuv {
extern "C" {
#endif
#if ORIGINAL_OPT
uint32_t HammingDistance_C1(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
int i;
for (i = 0; i < count; ++i) {
int x = src_a[i] ^ src_b[i];
if (x & 1)
++diff;
if (x & 2)
++diff;
if (x & 4)
++diff;
if (x & 8)
++diff;
if (x & 16)
++diff;
if (x & 32)
++diff;
if (x & 64)
++diff;
if (x & 128)
++diff;
}
return diff;
}
#endif
// Hakmem method for Hamming distance.
uint32_t HammingDistance_C(const uint8_t* src_a,
const uint8_t* src_b,

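The "Hakmem method" named above counts set bits arithmetically instead of testing each of the eight bits the way the deleted ORIGINAL_OPT loop did. A minimal sketch of the idea, assuming the XOR of the inputs is accumulated in 32-bit chunks (this is the classic SWAR bit count, not necessarily the exact libyuv formulation):

#include <stdint.h>

// Count set bits in one 32-bit word of XOR differences.
static uint32_t Popcount32(uint32_t x) {
  x = x - ((x >> 1) & 0x55555555);                 // 2-bit partial sums
  x = (x & 0x33333333) + ((x >> 2) & 0x33333333);  // 4-bit partial sums
  x = (x + (x >> 4)) & 0x0F0F0F0F;                 // 8-bit partial sums
  return (x * 0x01010101) >> 24;                   // total lands in top byte
}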

@ -212,11 +212,23 @@ ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
ANY41PT(MergeARGB16To8Row_Any_AVX2, MergeARGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
ANY41PT(MergeARGB16To8Row_Any_AVX2,
MergeARGB16To8Row_AVX2,
uint16_t,
2,
uint8_t,
4,
15)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_NEON
ANY41PT(MergeARGB16To8Row_Any_NEON, MergeARGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
ANY41PT(MergeARGB16To8Row_Any_NEON,
MergeARGB16To8Row_NEON,
uint16_t,
2,
uint8_t,
4,
7)
#endif
#undef ANY41PT
@ -487,7 +499,13 @@ ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#ifdef HAS_MERGEXR30ROW_NEON
ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
ANY31PT(MergeXR30Row_10_Any_NEON, MergeXR30Row_10_NEON, uint16_t, 2, uint8_t, 4, 3)
ANY31PT(MergeXR30Row_10_Any_NEON,
MergeXR30Row_10_NEON,
uint16_t,
2,
uint8_t,
4,
3)
#endif
#ifdef HAS_MERGEXR64ROW_AVX2
@ -499,11 +517,23 @@ ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 4, 7)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
ANY31PT(MergeXRGB16To8Row_Any_AVX2, MergeXRGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
ANY31PT(MergeXRGB16To8Row_Any_AVX2,
MergeXRGB16To8Row_AVX2,
uint16_t,
2,
uint8_t,
4,
15)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_NEON
ANY31PT(MergeXRGB16To8Row_Any_NEON, MergeXRGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7)
ANY31PT(MergeXRGB16To8Row_Any_NEON,
MergeXRGB16To8Row_NEON,
uint16_t,
2,
uint8_t,
4,
7)
#endif
#undef ANY31PT
@ -1553,20 +1583,20 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7)
#undef ANY11C
// Any 1 to 1 interpolate. Takes 2 rows of source via stride.
#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \
ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
SIMD_ALIGNED(uint8_t temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \
void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \
int width, int source_y_fraction) { \
SIMD_ALIGNED(uint8_t temp[64 * 3]); \
memset(temp, 0, 64 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \
} \
memcpy(temp, src_ptr + n * SBPP, r * SBPP); \
memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \
ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \
memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \
}
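The wrapper's width split is easiest to see with concrete numbers. A worked example, assuming MASK = 31 (a 32-pixel SIMD kernel such as the AVX2 interpolator):

int width = 100;
int r = width & 31;   // r = 4: ragged tail
int n = width & ~31;  // n = 96: largest multiple of 32
// The SIMD kernel handles pixels [0, 96). The 4 leftover pixels are
// copied into the aligned temp buffer, run as one more full 32-pixel
// block, and the first 4 result pixels are copied back out.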
#ifdef HAS_INTERPOLATEROW_AVX2
@ -1844,17 +1874,17 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \
void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \
uint8_t* dst_v, int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 4]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \
ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
@ -2001,17 +2031,17 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \
void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \
void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \
int width) { \
SIMD_ALIGNED(uint8_t temp[128 * 3]); \
memset(temp, 0, 128 * 2); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \
ANY_SIMD(src_ptr, src_stride, dst_vu, n); \
} \
memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \
memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \
SS(r, UVSHIFT) * BPP); \
if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \

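In the ANY12S and ANY11S helpers above, SS() rounds a pixel count up after the subsample shift, and odd widths repeat the last pixel so the 2x2 chroma average always has a full pair. For reference, SS is defined earlier in this file as:

#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

// e.g. r = 7 leftover YUY2 pixels with UVSHIFT = 1:
// SS(7, 1) = (7 + 1) >> 1 = 4 macropixels' worth of bytes are copied.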

@ -553,80 +553,80 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
// Intel version mimics SSE/AVX, which does 2 pavgb
#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
#else
// ARM version does sum / 2, then multiplies by 2x smaller coefficients
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = src_rgb0[B] + src_rgb1[B]; \
uint16_t ag = src_rgb0[G] + src_rgb1[G]; \
uint16_t ar = src_rgb0[R] + src_rgb1[R]; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
} \
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = src_rgb[B] + src_rgb1[B]; \
uint16_t ag = src_rgb[G] + src_rgb1[G]; \
uint16_t ar = src_rgb[R] + src_rgb1[R]; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
} \
}
#endif
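The two strategies compute the same 2x2 chroma average up to rounding; the ARM path just defers the final halving into coefficients that are 2x smaller. A self-contained check with made-up sample values:

#include <stdint.h>

static uint8_t avgb(uint8_t a, uint8_t b) {  // pavgb: rounding average
  return (uint8_t)((a + b + 1) >> 1);
}

int main(void) {
  uint8_t tl = 250, tr = 252, bl = 253, br = 255;  // one channel, 2x2 block
  // Intel style: two rounding-average passes -> 253 (true mean is 252.5).
  uint8_t intel = avgb(avgb(tl, bl), avgb(tr, br));
  // ARM style: rounded sum / 2 -> 505 == 2 * 252.5, held in 16 bits; the
  // RGB2xTo* helpers then apply coefficients scaled down by 2.
  uint16_t arm2x = (uint16_t)((tl + tr + bl + br + 1) >> 1);
  return (intel == 253 && arm2x == 505) ? 0 : 1;
}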
@ -694,80 +694,80 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) {
// ARGBToYJ_C and ARGBToUVJ_C
// Intel version mimics SSE/AVX, which does 2 pavgb
#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \
AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \
AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \
AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \
dst_u[0] = RGBToUJ(ar, ag, ab); \
dst_v[0] = RGBToVJ(ar, ag, ab); \
} \
}
#else
// ARM version does sum / 2, then multiplies by 2x smaller coefficients
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToUJ(ar, ag, ab); \
dst_v[0] = RGB2xToVJ(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
dst_u[0] = RGB2xToUJ(ar, ag, ab); \
dst_v[0] = RGB2xToVJ(ar, ag, ab); \
} \
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \
src_rgb += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToUJ(ar, ag, ab); \
dst_v[0] = RGB2xToVJ(ar, ag, ab); \
src_rgb += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = (src_rgb[B] + src_rgb1[B]); \
uint16_t ag = (src_rgb[G] + src_rgb1[G]); \
uint16_t ar = (src_rgb[R] + src_rgb1[R]); \
dst_u[0] = RGB2xToUJ(ar, ag, ab); \
dst_v[0] = RGB2xToVJ(ar, ag, ab); \
} \
}
#endif
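The J variants target full-range (JPEG) YUV, so the luma helpers drop the +16 offset of the limited-range path. A hedged reference for the BT.601 full-range luma (coefficients sum to 256; a sketch, not necessarily the exact libyuv code):

#include <stdint.h>

static uint8_t RGBToYJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  // 0.299, 0.587, 0.114 scaled by 256, round to nearest.
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
}
// RGBToYJ_Sketch(255, 255, 255) == 255 and RGBToYJ_Sketch(0, 0, 0) == 0,
// i.e. the full 0..255 output range is reachable.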
@ -1237,16 +1237,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb,
#define REPEAT8(v) (v) | ((v) << 8)
#define SHADE(f, v) v* f >> 16
void ARGBMultiplyRow_C(const uint8_t* src_argb0,
void ARGBMultiplyRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t b = REPEAT8(src_argb0[0]);
const uint32_t g = REPEAT8(src_argb0[1]);
const uint32_t r = REPEAT8(src_argb0[2]);
const uint32_t a = REPEAT8(src_argb0[3]);
const uint32_t b = REPEAT8(src_argb[0]);
const uint32_t g = REPEAT8(src_argb[1]);
const uint32_t r = REPEAT8(src_argb[2]);
const uint32_t a = REPEAT8(src_argb[3]);
const uint32_t b_scale = src_argb1[0];
const uint32_t g_scale = src_argb1[1];
const uint32_t r_scale = src_argb1[2];
@ -1255,7 +1255,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_scale);
dst_argb[2] = SHADE(r, r_scale);
dst_argb[3] = SHADE(a, a_scale);
src_argb0 += 4;
src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
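REPEAT8 widens a byte to 16 bits by replication (v * 257), so SHADE's >> 16 approximates v * f / 255 to within one LSB. A hypothetical standalone check (the helper name is invented for illustration):

#include <stdint.h>

static uint8_t Mul8Approx(uint8_t v, uint8_t scale) {
  uint32_t rep = (uint32_t)v | ((uint32_t)v << 8);  // REPEAT8: v * 257
  return (uint8_t)((rep * scale) >> 16);            // SHADE
}

int main(void) {
  // Exact v * scale / 255 would give 128 and 255; the shortcut is 1 low.
  return (Mul8Approx(255, 128) == 127 && Mul8Approx(255, 255) == 254) ? 0 : 1;
}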
@ -1265,16 +1265,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp255(v + f)
void ARGBAddRow_C(const uint8_t* src_argb0,
void ARGBAddRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
const int b = src_argb0[0];
const int g = src_argb0[1];
const int r = src_argb0[2];
const int a = src_argb0[3];
const int b = src_argb[0];
const int g = src_argb[1];
const int r = src_argb[2];
const int a = src_argb[3];
const int b_add = src_argb1[0];
const int g_add = src_argb1[1];
const int r_add = src_argb1[2];
@ -1283,7 +1283,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_add);
dst_argb[2] = SHADE(r, r_add);
dst_argb[3] = SHADE(a, a_add);
src_argb0 += 4;
src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@ -1292,16 +1292,16 @@ void ARGBAddRow_C(const uint8_t* src_argb0,
#define SHADE(f, v) clamp0(f - v)
void ARGBSubtractRow_C(const uint8_t* src_argb0,
void ARGBSubtractRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int i;
for (i = 0; i < width; ++i) {
const int b = src_argb0[0];
const int g = src_argb0[1];
const int r = src_argb0[2];
const int a = src_argb0[3];
const int b = src_argb[0];
const int g = src_argb[1];
const int r = src_argb[2];
const int a = src_argb[3];
const int b_sub = src_argb1[0];
const int g_sub = src_argb1[1];
const int r_sub = src_argb1[2];
@ -1310,7 +1310,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0,
dst_argb[1] = SHADE(g, g_sub);
dst_argb[2] = SHADE(r, r_sub);
dst_argb[3] = SHADE(a, a_sub);
src_argb0 += 4;
src_argb += 4;
src_argb1 += 4;
dst_argb += 4;
}
@ -1486,7 +1486,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#if defined(LIBYUV_UNLIMITED_DATA)
#if LIBYUV_UNLIMITED_DATA
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* max(128, round(2.018 * 64)) */
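Where 2.018 comes from, as a hedged derivation for BT.601 with studio-swing chroma:

  B - Y = 2 * (1 - KB) * Cb = 2 * 0.886 * Cb = 1.772 * Cb   (full range)
  1.772 * 255 / 224 = 2.0172...                             (16..240 chroma rescaled to 255)
  round(2.0172 * 64) = 129                                  (6-bit fixed point)

129 no longer fits the signed-byte math the default SIMD paths assume (an inference from the surrounding clamp comments), hence the cap at 128 unless LIBYUV_UNLIMITED_DATA is set.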
@ -1540,7 +1540,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
#if defined(LIBYUV_UNLIMITED_DATA)
#if LIBYUV_UNLIMITED_DATA
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* max(128, round(2.112 * 64)) */
@ -1594,7 +1594,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// KR = 0.2627; KB = 0.0593
// U and V contributions to R,G,B.
#if defined(LIBYUV_UNLIMITED_DATA)
#if LIBYUV_UNLIMITED_DATA
#define UB 137 /* round(2.142 * 64) */
#else
#define UB 128 /* max(128, round(2.142 * 64)) */
@ -1646,7 +1646,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef MAKEYUVCONSTANTS
#if defined(LIBYUV_UNLIMITED_DATA)
#if LIBYUV_UNLIMITED_DATA
// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 16 bit.
@ -3347,19 +3347,19 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_C(const uint8_t* src_argb0,
void ARGBBlendRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
uint32_t fb = src_argb0[0];
uint32_t fg = src_argb0[1];
uint32_t fr = src_argb0[2];
uint32_t a = src_argb0[3];
uint32_t fb = src_argb[0];
uint32_t fg = src_argb[1];
uint32_t fr = src_argb[2];
uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];
@ -3368,10 +3368,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
dst_argb[2] = BLEND(fr, br, a);
dst_argb[3] = 255u;
fb = src_argb0[4 + 0];
fg = src_argb0[4 + 1];
fr = src_argb0[4 + 2];
a = src_argb0[4 + 3];
fb = src_argb[4 + 0];
fg = src_argb[4 + 1];
fr = src_argb[4 + 2];
a = src_argb[4 + 3];
bb = src_argb1[4 + 0];
bg = src_argb1[4 + 1];
br = src_argb1[4 + 2];
@ -3379,16 +3379,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0,
dst_argb[4 + 1] = BLEND(fg, bg, a);
dst_argb[4 + 2] = BLEND(fr, br, a);
dst_argb[4 + 3] = 255u;
src_argb0 += 8;
src_argb += 8;
src_argb1 += 8;
dst_argb += 8;
}
if (width & 1) {
uint32_t fb = src_argb0[0];
uint32_t fg = src_argb0[1];
uint32_t fr = src_argb0[2];
uint32_t a = src_argb0[3];
uint32_t fb = src_argb[0];
uint32_t fg = src_argb[1];
uint32_t fr = src_argb[2];
uint32_t a = src_argb[3];
uint32_t bb = src_argb1[0];
uint32_t bg = src_argb1[1];
uint32_t br = src_argb1[2];

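The BLEND macro above is the premultiplied-alpha "over" operator: dst = fg + ((256 - a) * bg >> 8), clamped to 255. A quick self-contained check with made-up values:

#include <stdint.h>

static uint8_t Blend1(uint8_t f, uint8_t b, uint8_t a) {
  uint32_t v = (((256u - a) * b) >> 8) + f;  // BLEND(f, b, a)
  return (uint8_t)(v > 255 ? 255 : v);
}

int main(void) {
  // Opaque foreground replaces the background; a fully transparent
  // foreground (premultiplied, so f == 0) keeps the background.
  return (Blend1(100, 200, 255) == 100 && Blend1(0, 200, 0) == 200) ? 0 : 1;
}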

@ -1160,7 +1160,7 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
: "memory", "cc", "xmm0", "xmm1");
}
void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64,
void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
uint8_t* dst_argb,
int width) {
asm volatile(
@ -1178,7 +1178,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_ar64), // %0
: "+r"(src_ab64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleARGBToABGR) // %3
@ -1267,7 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
#endif
#ifdef HAS_AB64TOARGBROW_AVX2
void AB64ToARGBRow_AVX2(const uint16_t* src_ar64,
void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
uint8_t* dst_argb,
int width) {
asm volatile(
@ -1286,7 +1286,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ar64,
"lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ar64), // %0
: "+r"(src_ab64), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleARGBToABGR) // %3
@ -1506,7 +1506,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
#endif // HAS_RGBATOYJROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1558,7 +1558,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -1575,7 +1575,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1623,7 +1623,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -1638,7 +1638,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ABGRTOUVROW_AVX2
void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1686,7 +1686,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr0), // %0
: "+r"(src_abgr), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -1701,7 +1701,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0,
#endif // HAS_ABGRTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1750,7 +1750,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -1765,7 +1765,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVJROW_AVX2
#ifdef HAS_ARGBTOUVJROW_SSSE3
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1818,7 +1818,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -1905,7 +1905,7 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1957,7 +1957,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_bgra0), // %0
: "+r"(src_bgra), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -2002,7 +2002,7 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
@ -2054,7 +2054,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_abgr0), // %0
: "+r"(src_abgr), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -2065,7 +2065,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
@ -2117,7 +2117,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
"lea 0x8(%1),%1 \n"
"sub $0x10,%3 \n"
"jg 1b \n"
: "+r"(src_rgba0), // %0
: "+r"(src_rgba), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
@ -5741,7 +5741,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r,
#if defined(__i386__)
: "m"(shift) // %5
#else
: "rm"(shift) // %5
: "rm"(shift) // %5
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
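On the "m" versus "rm" constraints above: "rm" lets the compiler place the operand in a register or in memory, while plain "m" forces memory. Forcing memory on i386 presumably sidesteps register exhaustion (PIC reserves %ebx and these kernels already pin most general registers); treat that rationale as an assumption. A minimal sketch of the two constraint forms:

static int AddOne(int x) {
  int r;
  asm("movl %1, %0\n\t"
      "addl $1, %0"
      : "=r"(r)    // output: any general register
      : "rm"(x));  // input: register or memory, compiler's choice
  return r;
}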
@ -5813,9 +5813,9 @@ void MergeAR64Row_AVX2(const uint16_t* src_r,
: "m"(shift), // %6
"m"(mask), // %7
#else
"+rm"(width) // %5
: "rm"(shift), // %6
"rm"(mask), // %7
"+rm"(width) // %5
: "rm"(shift), // %6
"rm"(mask), // %7
#endif
"m"(MergeAR64Permute) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
@ -5882,8 +5882,8 @@ void MergeXR64Row_AVX2(const uint16_t* src_r,
: "m"(shift), // %5
"m"(mask), // %6
#else
: "rm"(shift), // %5
"rm"(mask), // %6
: "rm"(shift), // %5
"rm"(mask), // %6
#endif
"m"(MergeAR64Permute) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
@ -5944,8 +5944,8 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
"+m"(width) // %5
: "m"(shift), // %6
#else
"+rm"(width) // %5
: "rm"(shift), // %6
"+rm"(width) // %5
: "rm"(shift), // %6
#endif
"m"(MergeARGB16To8Shuffle) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
@ -6000,7 +6000,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
#if defined(__i386__)
: "m"(shift), // %5
#else
: "rm"(shift), // %5
: "rm"(shift), // %5
#endif
"m"(MergeARGB16To8Shuffle) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
@ -6732,7 +6732,7 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time
void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -6803,7 +6803,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
"sub $0x1,%3 \n"
"jge 91b \n"
"99: \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7405,7 +7405,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7433,7 +7433,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
"lea 0x10(%2),%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7444,7 +7444,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7471,7 +7471,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
"sub $0x8,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7482,7 +7482,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
void ARGBAddRow_SSE2(const uint8_t* src_argb0,
void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7499,7 +7499,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
"lea 0x10(%2),%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7510,7 +7510,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_AVX2(const uint8_t* src_argb0,
void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7527,7 +7527,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
"sub $0x8,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7538,7 +7538,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7555,7 +7555,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
"lea 0x10(%2),%2 \n"
"sub $0x4,%3 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -7566,7 +7566,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -7583,7 +7583,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
"sub $0x8,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3

File diff suppressed because it is too large


@ -781,7 +781,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
}
}
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
@ -792,10 +792,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
@ -822,18 +822,18 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
src_argb += 64;
dst_y += 16;
}
}
void ARGBToUVRow_MSA(const uint8_t* src_argb0,
void ARGBToUVRow_MSA(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
const uint8_t* src_argb_next = src_argb + src_stride_argb;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
@ -847,14 +847,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001);
for (x = 0; x < width; x += 32) {
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@ -875,14 +875,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
reg3 = __msa_hadd_u_h(vec5, vec5);
reg4 = __msa_hadd_u_h(vec0, vec0);
reg5 = __msa_hadd_u_h(vec1, vec1);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0);
src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16);
src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32);
src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48);
src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64);
src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80);
src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96);
src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112);
vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
@ -945,8 +945,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0,
dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
ST_UB(dst0, dst_u);
ST_UB(dst1, dst_v);
src_argb0 += 128;
src_argb0_next += 128;
src_argb += 128;
src_argb_next += 128;
dst_u += 16;
dst_v += 16;
}
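For orientation, the kernel above implements the usual two-row UV path: average each 2x2 block of ARGB pixels across the row pair, then apply the BT.601 studio-range U/V weights with a packed bias. A scalar sketch follows; the constants match libyuv's C reference as best recalled, so treat the exact rounding as an assumption:

#include <stdint.h>

// Scalar sketch of ARGBToUVRow: subsample 2x2 with rounding, then
// BT.601 U/V. 0x8080 packs the +128 chroma bias plus 0.5 for rounding.
// libyuv ARGB memory order is B, G, R, A.
static void ARGBToUVRow_Scalar(const uint8_t* src_argb,
                               int src_stride_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width) {
  const uint8_t* next = src_argb + src_stride_argb;
  for (int x = 0; x < width; x += 2) {
    int b = (src_argb[0] + src_argb[4] + next[0] + next[4] + 2) >> 2;
    int g = (src_argb[1] + src_argb[5] + next[1] + next[5] + 2) >> 2;
    int r = (src_argb[2] + src_argb[6] + next[2] + next[6] + 2) >> 2;
    *dst_u++ = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    *dst_v++ = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    src_argb += 8;
    next += 8;
  }
}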
@ -1173,7 +1173,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb,
}
}
void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
void ARGBMultiplyRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -1184,7 +1184,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
v8i16 zero = {0};
for (x = 0; x < width; x += 4) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
@ -1206,13 +1206,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_argb);
src_argb0 += 16;
src_argb += 16;
src_argb1 += 16;
dst_argb += 16;
}
}
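The self-interleaves here (__msa_ilvr_b(src0, src0)) are the same trick the SSE2 path plays with punpcklbw x, x followed by pmulhuw: duplicating a byte into both halves of a 16-bit lane multiplies it by 257 (0x0101), so the high 16 bits of the product approximate division by 255. In scalar form (a sketch, not libyuv's C reference):

#include <stdint.h>

// One channel of ARGBMultiplyRow: (a * 0x0101 * b) >> 16 approximates
// (a * b) / 255; the fast form can come out one LSB low versus exact
// division (e.g. 255 * 255 yields 254).
static inline uint8_t MultiplyChannel(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint32_t)a * 0x0101u * b) >> 16);
}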
void ARGBAddRow_MSA(const uint8_t* src_argb0,
void ARGBAddRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -1220,20 +1220,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_adds_u_b(src0, src2);
dst1 = __msa_adds_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb0 += 32;
src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
}
void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
void ARGBSubtractRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -1241,14 +1241,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
v16u8 src0, src1, src2, src3, dst0, dst1;
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
dst0 = __msa_subs_u_b(src0, src2);
dst1 = __msa_subs_u_b(src1, src3);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb0 += 32;
src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}
@ -1794,7 +1794,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
}
}
void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@ -1809,9 +1809,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@ -1830,12 +1830,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
src_argb0 += 48;
src_argb += 48;
dst_y += 16;
}
}
void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
v8u16 vec0, vec1, vec2, vec3;
@ -1850,9 +1850,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v16i8 zero = {0};
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
@ -1871,7 +1871,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
ST_UB(dst0, dst_y);
src_argb0 += 48;
src_argb += 48;
dst_y += 16;
}
}
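Both RGB24ToYRow and RAWToYRow reduce to the BT.601 studio-range luma dot product; the packed const_0x1080 folds the +16 black-level offset and the rounding half together, i.e. (16 << 8) + 128. A one-line scalar sketch of that math:

#include <stdint.h>

// BT.601 studio-range luma: weights sum to 220, so output spans 16..235.
static inline uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
}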
@ -2037,14 +2037,14 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
}
}
void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
void RGB24ToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
@ -2147,14 +2147,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
}
}
void RAWToUVRow_MSA(const uint8_t* src_rgb0,
void RAWToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
int64_t res0, res1;
v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
@ -2446,7 +2446,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx,
}
}
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D);
@ -2454,19 +2454,19 @@ void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
src_argb += 64;
dst_y += 16;
}
}
void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
@ -2474,19 +2474,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
src_argb += 64;
dst_y += 16;
}
}
void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
@ -2494,19 +2494,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
src_argb += 64;
dst_y += 16;
}
}
void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) {
int x;
v16u8 src0, src1, src2, src3, dst0;
v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
@ -2514,26 +2514,26 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
for (x = 0; x < width; x += 16) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb, 32);
src3 = (v16u8)__msa_ld_b((void*)src_argb, 48);
ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
dst0);
ST_UB(dst0, dst_y);
src_argb0 += 64;
src_argb += 64;
dst_y += 16;
}
}
void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
void ARGBToUVJRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
v8u16 src0, src1, src2, src3, src4, src5, src6, src7;
v8u16 vec0, vec1, vec2, vec3;
v8u16 dst0, dst1, dst2, dst3;
@ -2658,14 +2658,14 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
}
}
void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
void BGRAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
const uint8_t unused = 0xf;
v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
@ -2693,14 +2693,14 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
}
}
void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
void ABGRToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
const uint8_t unused = 0xf;
v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
@ -2728,14 +2728,14 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
}
}
void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
void RGBAToUVRow_MSA(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int x;
const uint8_t* s = src_rgb0;
const uint8_t* t = src_rgb0 + src_stride_rgb;
const uint8_t* s = src_rgb;
const uint8_t* t = src_rgb + src_stride_rgb;
const uint8_t unused = 0xf;
v8u16 src0, src1, src2, src3;
v16u8 dst0, dst1;
@ -3109,7 +3109,7 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
}
}
void ARGBBlendRow_MSA(const uint8_t* src_argb0,
void ARGBBlendRow_MSA(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -3123,8 +3123,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
v16i8 zero = {0};
for (x = 0; x < width; x += 8) {
src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16);
src0 = (v16u8)__msa_ld_b((void*)src_argb, 0);
src1 = (v16u8)__msa_ld_b((void*)src_argb, 16);
src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0);
src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16);
vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
@ -3168,7 +3168,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0,
dst0 = __msa_bmnz_v(dst0, const_255, mask);
dst1 = __msa_bmnz_v(dst1, const_255, mask);
ST_UB2(dst0, dst1, dst_argb, 16);
src_argb0 += 32;
src_argb += 32;
src_argb1 += 32;
dst_argb += 32;
}


@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READNV12 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READNV21 YUVTORGB
"subs %3, %3, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_argb), // %2
@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READYUY2 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
"vmov.u8 d23, #255 \n"
"1: \n" READUYVY YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
@ -1680,7 +1680,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
: "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
@ -1694,7 +1694,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
@ -2655,7 +2655,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -2706,7 +2706,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
"99: \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -2944,7 +2944,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
}
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -2964,7 +2964,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -2973,7 +2973,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -2987,7 +2987,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"vqadd.u8 q1, q1, q3 \n" // add R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -2996,7 +2996,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -3010,7 +3010,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3


@ -909,7 +909,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
"+r"(src_b), // %2
"+r"(dst_ar30), // %3
"+r"(width) // %4
: "r"(shift) // %5
: "r"(shift) // %5
: "memory", "cc", "v0", "v1", "v2", "v30", "v31");
}
@ -1305,10 +1305,10 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"movi v5.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v3.8b, v1.8b, v1.8b \n" // move g
"prfm pldl1keep, [%0, 448] \n"
"orr v4.8b, v0.8b, v0.8b \n" // move r
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"b.gt 1b \n"
: "+r"(src_raw), // %0
@ -1324,10 +1324,10 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"movi v0.8b, #255 \n" // Alpha
"1: \n"
"ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v2.8b, v4.8b, v4.8b \n" // move g
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v2.8b, v4.8b, v4.8b \n" // move g
"prfm pldl1keep, [%0, 448] \n"
"orr v1.8b, v5.8b, v5.8b \n" // move r
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"b.gt 1b \n"
: "+r"(src_raw), // %0
@ -1377,8 +1377,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
RGB565TOARGB
"prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
@ -1467,8 +1466,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
ARGB4444TOARGB
"prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
@ -1485,7 +1483,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
asm volatile(
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24
@ -1502,8 +1500,8 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile(
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"orr v4.8b, v2.8b, v2.8b \n" // mov g
"prfm pldl1keep, [%0, 448] \n"
"orr v5.8b, v1.8b, v1.8b \n" // mov b
"st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
@ -1676,7 +1674,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
asm volatile(
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
"subs %w4, %w4, #16 \n" // 16 pixels
"subs %w4, %w4, #16 \n" // 16 pixels
"orr v2.8b, v1.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n"
"ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
@ -1724,8 +1722,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
ARGBTORGB565
"prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -1766,8 +1763,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
ARGBTOARGB1555
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -1787,8 +1783,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
"ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8
// pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"prfm pldl1keep, [%0, 448] \n"
ARGBTOARGB4444
"prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -1956,7 +1951,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
@ -1971,7 +1966,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
: "+r"(src_argb), // %0
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
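The setup constants 29, 150 and 77 are the full-range (JPEG) luma weights 0.1140, 0.5870 and 0.2990 scaled by 256, and uqrshrn #8 contributes the +128 round. Scalar equivalent:

#include <stdint.h>

// Full-range (JPEG) luma as computed by RGBAToYJRow/ARGBToYJRow:
// no +16 offset, and the weights sum to 256 so 255 maps to 255.
static inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((77 * r + 150 * g + 29 * b + 128) >> 8);
}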
@ -2668,8 +2663,8 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
@ -2692,8 +2687,8 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"movi v7.8b, #16 \n" // Add 16 constant
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v16.8h, v0.8b, v4.8b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R
@ -2715,8 +2710,8 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // B
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
@ -2737,8 +2732,8 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"movi v4.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // B
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v0.8b, v4.8b \n" // B
"prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
@ -2818,7 +2813,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb0,
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -2880,7 +2875,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0,
"99: \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -2900,11 +2895,11 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v3.8b \n" // b * a
"prfm pldl1keep, [%0, 448] \n"
"umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"umull v5.8h, v1.8b, v3.8b \n" // g * a
"umull v6.8h, v2.8b, v3.8b \n" // r * a
"uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
"uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb), // %0
@ -2930,8 +2925,8 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
// 8 pixel loop.
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"prfm pldl1keep, [%0, 448] \n"
"uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n"
@ -3040,8 +3035,8 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"movi v30.8b, #50 \n" // BR coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"prfm pldl1keep, [%0, 448] \n"
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
@ -3127,7 +3122,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -3149,7 +3144,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -3158,7 +3153,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
}
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
void ARGBAddRow_NEON(const uint8_t* src_argb0,
void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -3176,7 +3171,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -3185,7 +3180,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
}
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
@ -3203,7 +3198,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"b.gt 1b \n"
: "+r"(src_argb0), // %0
: "+r"(src_argb), // %0
"+r"(src_argb1), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
@ -3703,9 +3698,9 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
"prfm pldl1keep, [%0, 448] \n"
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels


@ -1427,7 +1427,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
}
}
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1573,7 +1573,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
}
#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1641,7 +1641,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
#endif // HAS_ARGBTOUVROW_AVX2
#ifdef HAS_ARGBTOUVJROW_AVX2
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1709,7 +1709,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
}
#endif // HAS_ARGBTOUVJROW_AVX2
__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
@ -1767,7 +1767,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
}
}
__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1839,7 +1839,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -1911,7 +1911,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
}
}
__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4347,13 +4347,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
// Blend 8 pixels at a time.
__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@ -4442,7 +4442,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
pcmpeqb xmm3, xmm3 // generate mask 0xff000000
@ -4487,7 +4487,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@ -4581,7 +4581,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
__asm {
mov eax, [esp + 4] // src_argb0
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
sub edx, eax
@ -4937,20 +4937,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
@ -4958,8 +4958,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
@ -4977,13 +4977,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@ -4992,11 +4992,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop49
convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@ -5007,11 +5007,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
jl convertloop19
convertloop1:
movd xmm0, [eax] // read 1 pixels from src_argb0
movd xmm0, [eax] // read 1 pixel from src_argb
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@ -5026,23 +5026,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm0, [eax] // read 4 pixels from src_argb
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@ -5056,20 +5056,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
@ -5077,8 +5077,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4
vpunpckhbw ymm3, ymm3, ymm5 // high 4
vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4
vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [edx], ymm0
lea edx, [edx + 32]
@ -5094,19 +5094,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
@ -5124,21 +5124,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb0
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
vmovdqu ymm0, [eax] // read 8 pixels from src_argb
lea eax, [eax + 32]
vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]


@ -1446,7 +1446,8 @@ void ScalePlaneUp2_Bilinear(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
// TODO(fbarchard): Test performance of writing one row of destination at a
// time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {


@ -746,7 +746,8 @@ void ScaleUVBilinearUp2(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
// TODO(fbarchard): Test performance of writing one row of destination at a
// time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
@ -851,7 +852,8 @@ void ScaleUVBilinearUp2_16(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
// TODO(fbarchard): Test performance of writing one row of destination at a
// time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {


@ -2404,8 +2404,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(dst_pixels_opt, kPixels);
align_buffer_page_end(dst_pixels_c, kPixels);
@ -2433,8 +2432,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) {
}
TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(orig_pixels, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 4);
align_buffer_page_end(dst_pixels_c, kPixels * 4);
@ -2567,35 +2565,25 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_u, kPixels);
align_buffer_page_end(src_pixels_v, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
MemRandomize(src_pixels, kPixels * 2);
MemRandomize(tmp_pixels_u, kPixels);
MemRandomize(tmp_pixels_v, kPixels);
MemRandomize(src_pixels_u, kPixels);
MemRandomize(src_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
tmp_pixels_v, benchmark_width_, benchmark_width_,
benchmark_height_);
MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
tmp_pixels_v, benchmark_width_, benchmark_width_,
benchmark_height_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
}
@ -2604,119 +2592,88 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_u);
free_aligned_buffer_page_end(tmp_pixels_v);
free_aligned_buffer_page_end(src_pixels_u);
free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
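The test now feeds MergeUVPlane randomized planes directly instead of round-tripping through SplitUVPlane first. The function under test is a plain interleave; roughly, under the usual NV12-style layout:

#include <stdint.h>

// Reference semantics of MergeUVPlane (sketch): interleave the U and V
// planes into one NV12-style UV plane, row by row.
static void MergeUVPlane_Sketch(const uint8_t* src_u, int src_stride_u,
                                const uint8_t* src_v, int src_stride_v,
                                uint8_t* dst_uv, int dst_stride_uv,
                                int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst_uv[2 * x + 0] = src_u[x];
      dst_uv[2 * x + 1] = src_v[x];
    }
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
}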
// 16 bit channel split and merge
TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(tmp_pixels_u_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2);
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
MemRandomize(src_pixels, kPixels * 2 * 2);
MemRandomize(tmp_pixels_u_c, kPixels * 2);
MemRandomize(tmp_pixels_v_c, kPixels * 2);
MemRandomize(tmp_pixels_u_opt, kPixels * 2);
MemRandomize(tmp_pixels_v_opt, kPixels * 2);
MemRandomize(src_pixels_u, kPixels * 2);
MemRandomize(src_pixels_v, kPixels * 2);
MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
MemRandomize(dst_pixels_c, kPixels * 2 * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_c, benchmark_width_,
(uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_,
benchmark_height_, 12);
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_,
(const uint16_t*)tmp_pixels_v_c, benchmark_width_,
MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
(const uint16_t*)src_pixels_v, benchmark_width_,
(uint16_t*)dst_pixels_c, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
MaskCpuFlags(benchmark_cpu_info_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(uint16_t*)tmp_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 12);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(const uint16_t*)tmp_pixels_v_opt, benchmark_width_,
MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_,
(const uint16_t*)src_pixels_v, benchmark_width_,
(uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
}
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]);
EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]);
}
for (int i = 0; i < kPixels * 2 * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_u_c);
free_aligned_buffer_page_end(tmp_pixels_v_c);
free_aligned_buffer_page_end(tmp_pixels_u_opt);
free_aligned_buffer_page_end(tmp_pixels_v_opt);
free_aligned_buffer_page_end(src_pixels_u);
free_aligned_buffer_page_end(src_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(tmp_pixels_u, kPixels);
align_buffer_page_end(tmp_pixels_v, kPixels);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
align_buffer_page_end(dst_pixels_u_c, kPixels);
align_buffer_page_end(dst_pixels_v_c, kPixels);
align_buffer_page_end(dst_pixels_u_opt, kPixels);
align_buffer_page_end(dst_pixels_v_opt, kPixels);
MemRandomize(src_pixels, kPixels * 2);
MemRandomize(tmp_pixels_u, kPixels);
MemRandomize(tmp_pixels_v, kPixels);
MemRandomize(dst_pixels_opt, kPixels * 2);
MemRandomize(dst_pixels_c, kPixels * 2);
MemRandomize(dst_pixels_u_c, kPixels);
MemRandomize(dst_pixels_v_c, kPixels);
MemRandomize(dst_pixels_u_opt, kPixels);
MemRandomize(dst_pixels_v_opt, kPixels);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_,
tmp_pixels_v, benchmark_width_, benchmark_width_,
benchmark_height_);
MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
dst_pixels_c, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c,
benchmark_width_, dst_pixels_v_c, benchmark_width_,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u,
benchmark_width_, tmp_pixels_v, benchmark_width_,
SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt,
benchmark_width_, dst_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_);
}
MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_,
dst_pixels_opt, benchmark_width_ * 2, benchmark_width_,
benchmark_height_);
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_u);
free_aligned_buffer_page_end(tmp_pixels_v);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
free_aligned_buffer_page_end(dst_pixels_u_c);
free_aligned_buffer_page_end(dst_pixels_v_c);
free_aligned_buffer_page_end(dst_pixels_u_opt);
free_aligned_buffer_page_end(dst_pixels_v_opt);
}
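Likewise, the split test now compares the C and optimized SplitUVPlane outputs directly. The operation is the inverse of the merge sketched above:

#include <stdint.h>

// Reference semantics of SplitUVPlane (sketch): deinterleave an
// NV12-style UV plane back into separate U and V planes.
static void SplitUVPlane_Sketch(const uint8_t* src_uv, int src_stride_uv,
                                uint8_t* dst_u, int dst_stride_u,
                                uint8_t* dst_v, int dst_stride_v,
                                int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst_u[x] = src_uv[2 * x + 0];
      dst_v[x] = src_uv[2 * x + 1];
    }
    src_uv += src_stride_uv;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
}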
// 16 bit channel split
TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
@ -2755,7 +2712,7 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2);
@ -2785,7 +2742,7 @@ TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -2834,7 +2791,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 3);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -2881,8 +2838,7 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -2936,8 +2892,7 @@ TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -2991,8 +2946,7 @@ TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -3042,8 +2996,7 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
}
TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels, kPixels * 4);
align_buffer_page_end(tmp_pixels_r, kPixels);
align_buffer_page_end(tmp_pixels_g, kPixels);
@ -3091,30 +3044,29 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// Merge 4 channels
#define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
const int kPixels = kWidth * benchmark_height_; \
align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
for (int i = 0; i < kPixels; ++i) { \
src_pixels_r[i] = fastrand() & 65535; \
src_pixels_g[i] = fastrand() & 65535; \
src_pixels_b[i] = fastrand() & 65535; \
src_pixels_a[i] = fastrand() & 65535; \
} \
memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \
@ -3136,27 +3088,26 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_memory_opt); \
}
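
The per-element fastrand() loops give way to MemRandomize over the raw source bytes, before the pointers are reinterpret_cast to the sample type. A sketch of a MemRandomize-style helper, modeled on libyuv's test utility (an assumption; fastrand() is the test suite's pseudo-random generator):

#include <cstdint>
extern int fastrand();  // assumed: the unit tests' PRNG

// Fill len bytes with pseudo-random data, two bytes per fastrand() call.
static void MemRandomizeSketch(uint8_t* dst, int64_t len) {
  int64_t i = 0;
  for (; i + 1 < len; i += 2) {
    int r = fastrand();
    dst[i] = static_cast<uint8_t>(r);
    dst[i + 1] = static_cast<uint8_t>(r >> 8);
  }
  if (i < len) {
    dst[i] = static_cast<uint8_t>(fastrand());
  }
}
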
// Merge 3 channel RGB into 4 channel XRGB with opaque alpha
#define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
+const int kPixels = kWidth * benchmark_height_; \
align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
-for (int i = 0; i < kPixels; ++i) { \
-src_pixels_r[i] = fastrand() & 65535; \
-src_pixels_g[i] = fastrand() & 65535; \
-src_pixels_b[i] = fastrand() & 65535; \
-} \
-memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
-memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \
@ -3177,6 +3128,7 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
free_aligned_buffer_page_end(dst_memory_opt); \
}
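
In the opaque variant the alpha plane is passed as NULL with a zero stride, which MergeARGBPlane treats as "write fully opaque alpha". A simplified per-row model of that behavior (a sketch, not the shipped kernel):

#include <cstdint>

// Merge planar R, G, B (and optional A) into interleaved ARGB bytes.
// libyuv's ARGB is B,G,R,A in memory (little-endian ARGB word).
static void MergeARGBRowSketch(const uint8_t* src_r, const uint8_t* src_g,
                               const uint8_t* src_b, const uint8_t* src_a,
                               uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 0] = src_b[i];
    dst_argb[4 * i + 1] = src_g[i];
    dst_argb[4 * i + 2] = src_r[i];
    dst_argb[4 * i + 3] = src_a ? src_a[i] : 255;  // opaque when A is NULL
  }
}
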
// TODO(fbarchard): fix bug and change to benchmark_width - 1
#define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
@ -3206,16 +3158,14 @@ TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
-for (int i = 0; i < kPixels; ++i) { \
-src_pixels_r[i] = fastrand() & 65535; \
-src_pixels_g[i] = fastrand() & 65535; \
-src_pixels_b[i] = fastrand() & 65535; \
-} \
memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
MaskCpuFlags(disable_cpu_flags_); \
@ -3238,13 +3188,13 @@ TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
free_aligned_buffer_page_end(dst_memory_opt); \
}
// TODO(fbarchard): Fix MergeXR30 and change _any to width - 1
#define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
1) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0)
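
Each invocation below stamps out four tests: _Any uses a width that is not a multiple of the SIMD step (benchmark_width_ - 4), _Unaligned offsets the source buffers by one byte (OFF = 1), _Invert negates the sign applied by NEG, and _Opt runs the aligned fast path. The negated value presumably follows libyuv's usual negative-height flip convention, sketched here:

#include <cstdint>

// libyuv convention (sketch): a negative height selects bottom-up
// processing - start at the last row and walk the stride backwards.
static const uint8_t* ApplyNegativeHeight(const uint8_t* src,
                                          int* src_stride, int* height) {
  if (*height < 0) {
    *height = -*height;
    src += static_cast<int64_t>(*height - 1) * (*src_stride);
    *src_stride = -*src_stride;
  }
  return src;
}
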
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12)
TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
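
MergeXR30 packs three high-bit-depth planes into the 2:10:10:10 AR30 layout (B in the low bits, 2-bit alpha on top); for 12- and 16-bit sources the samples are presumably scaled down to 10 bits first. A per-pixel packing sketch:

#include <cstdint>

// Pack 10-bit R, G, B into one little-endian AR30 word with opaque alpha.
static uint32_t PackXR30Sketch(uint32_t r10, uint32_t g10, uint32_t b10) {
  return (3u << 30) | (r10 << 20) | (g10 << 10) | b10;
}
// Example: white (1023, 1023, 1023) -> 0xFFFFFFFF.
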
@ -3254,6 +3204,7 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_u, kPixels * 2);
align_buffer_page_end(src_pixels_v, kPixels * 2);
align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2);
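
MergeUVRow_16 interleaves 16-bit U and V samples into a UV plane while scaling each value (e.g. a scale of 64 promotes 10-bit data to msb-aligned 16-bit). A C model of the expected behavior, assuming the (src_u, src_v, dst_uv, scale, width) signature used by the row functions of this era:

#include <cstdint>

// Interleave and scale 16-bit U/V samples: dst is U0,V0,U1,V1,...
static void MergeUVRow16Sketch(const uint16_t* src_u, const uint16_t* src_v,
                               uint16_t* dst_uv, int scale, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = static_cast<uint16_t>(src_u[x] * scale);
    dst_uv[2 * x + 1] = static_cast<uint16_t>(src_v[x] * scale);
  }
}
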
@ -3299,6 +3250,7 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
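
MultiplyRow_16 scales each 16-bit sample by a constant, the building block for lsb-to-msb bit-depth promotion (e.g. 10-bit values times 64). A sketch of that contract:

#include <cstdint>

// Multiply each 16-bit sample by scale; the product wraps modulo 65536.
static void MultiplyRow16Sketch(const uint16_t* src_y, uint16_t* dst_y,
                                int scale, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = static_cast<uint16_t>(src_y[x] * scale);
  }
}
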
@ -3334,8 +3286,7 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) {
#endif // HAS_MULTIPLYROW_16_AVX2
TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
-// Round count up to multiple of 16
-const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels * 2);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
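
Convert16To8Plane narrows 16-bit samples to 8 bits through a 16.16 fixed-point scale (16384 maps 10-bit input to full-range 8-bit; 256 maps 16-bit input). A per-row sketch under that assumption:

#include <cstdint>

static uint8_t Clamp255Sketch(int v) {
  return static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// dst = (src * scale) >> 16, clamped to [0, 255].
static void Convert16To8RowSketch(const uint16_t* src_y, uint8_t* dst_y,
                                  int scale, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = Clamp255Sketch((src_y[x] * scale) >> 16);
  }
}
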
@ -3414,8 +3365,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) {
#endif // ENABLE_ROW_TESTS
TEST_F(LibYUVPlanarTest, Convert8To16Plane) {
-// Round count up to multiple of 16
-const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_y_c, kPixels * 2);
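
Convert8To16Plane widens 8-bit samples, with the scale again encoding the target depth (1024 = 10 bits, 4096 = 12 bits, 65536 = 16 bits). Replicating the byte into 16 bits and shifting down to the target depth is an equivalent model; a sketch with the depth written out explicitly (the shipped API takes the fixed-point scale instead):

#include <cstdint>

// Promote 8-bit samples to `depth` bits (10/12/16): replicate the byte
// (0xAB -> 0xABAB), then shift down, so 255 -> 1023 at depth 10.
static void Convert8To16RowSketch(const uint8_t* src_y, uint16_t* dst_y,
                                  int depth, int width) {
  for (int x = 0; x < width; ++x) {
    uint16_t v = static_cast<uint16_t>(src_y[x] * 0x0101);
    dst_y[x] = v >> (16 - depth);
  }
}
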