From 60db98b6fae5e4d665c79d494c34f4d192bc4894 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Thu, 1 Apr 2021 14:20:35 -0700 Subject: [PATCH] clang-tidy applied Bug: libyuv:886, libyuv:889 Change-Id: I2d14d03c19402381256d3c6d988e0b7307bdffd8 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2800147 Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/row.h | 338 +++++++++++++++--------------- include/libyuv/version.h | 2 +- source/compare_common.cc | 30 --- source/row_any.cc | 80 ++++--- source/row_common.cc | 360 ++++++++++++++++---------------- source/row_gcc.cc | 86 ++++---- source/row_mmi.cc | 436 +++++++++++++++++++-------------------- source/row_msa.cc | 186 ++++++++--------- source/row_neon.cc | 60 +++--- source/row_neon64.cc | 95 ++++----- source/row_win.cc | 80 +++---- source/scale.cc | 3 +- source/scale_uv.cc | 6 +- unit_test/planar_test.cc | 200 +++++++----------- 15 files changed, 958 insertions(+), 1006 deletions(-) diff --git a/README.chromium b/README.chromium index aad5369e5..92a61a6a8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1784 +Version: 1785 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 87ef32055..0e47bfbd9 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -767,7 +767,7 @@ struct YuvConstants { #else // This struct is for Intel color conversion. struct YuvConstants { -#if LIBYUV_UNLIMITED_DATA +#if defined(LIBYUV_UNLIMITED_DATA) uint8_t kUVToB[32]; uint8_t kUVToG[32]; uint8_t kUVToR[32]; @@ -1063,11 +1063,11 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); -void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width); -void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); +void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); +void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); @@ -1262,16 +1262,16 @@ void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, 
int width); -void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RGB24ToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); -void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); -void RAWToYJRow_C(const uint8_t* src_argb, uint8_t* dst_yj, int width); +void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); +void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1373,42 +1373,42 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_v, int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1417,7 +1417,7 @@ void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1440,47 +1440,47 @@ void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* 
dst_v, int width); void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -1689,7 +1689,7 @@ void MirrorSplitUVRow_C(const uint8_t* src_uv, void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); @@ -1705,9 +1705,13 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); -void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1928,23 +1932,23 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width); -void MergeARGBRow_Any_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); -void MergeARGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - const uint8_t* src_a, - uint8_t* dst_argb, +void MergeARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, int width); void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -1970,31 +1974,31 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_NEON(const uint8_t* src_argb, +void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_AVX2(const 
uint8_t* src_argb, +void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); -void SplitARGBRow_Any_NEON(const uint8_t* src_argb, +void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, @@ -2020,20 +2024,20 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width); -void MergeXRGBRow_Any_SSE2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_AVX2(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); -void MergeXRGBRow_Any_NEON(const uint8_t* src_r, - const uint8_t* src_g, - const uint8_t* src_b, - uint8_t* dst_argb, +void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, int width); void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, @@ -2055,27 +2059,27 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_NEON(const uint8_t* src_argb, +void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSE2(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_SSSE3(const uint8_t* src_argb, +void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_AVX2(const uint8_t* src_argb, +void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); -void SplitXRGBRow_Any_NEON(const uint8_t* src_argb, +void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, @@ -2183,74 +2187,74 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, uint8_t* dst_argb, int depth, int width); -void MergeXR30Row_Any_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, +void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeAR64Row_Any_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, +void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, int depth, int width); -void MergeXR64Row_Any_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, +void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, int depth, int width); -void MergeARGB16To8Row_Any_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, +void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeXRGB16To8Row_Any_AVX2(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, 
- uint8_t* dst_argb, +void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeXR30Row_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, +void MergeXR30Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeXR30Row_10_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_ar30, +void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeAR64Row_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint16_t* dst_ar64, +void MergeAR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint16_t* dst_ptr, int depth, int width); -void MergeARGB16To8Row_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - const uint16_t* src_a, - uint8_t* dst_argb, +void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + const uint16_t* a_buf, + uint8_t* dst_ptr, int depth, int width); -void MergeXR64Row_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint16_t* dst_ar64, +void MergeXR64Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint16_t* dst_ptr, int depth, int width); -void MergeXRGB16To8Row_Any_NEON(const uint16_t* src_r, - const uint16_t* src_g, - const uint16_t* src_b, - uint8_t* dst_argb, +void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, + const uint16_t* g_buf, + const uint16_t* b_buf, + uint8_t* dst_ptr, int depth, int width); @@ -2314,16 +2318,16 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void MultiplyRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -2335,16 +2339,16 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_AVX2(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); -void DivideRow_16_Any_NEON(const uint16_t* src_y, - uint16_t* dst_y, +void DivideRow_16_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, int scale, int width); @@ -3719,15 +3723,15 @@ void I400ToARGBRow_MMI(const uint8_t* src_y, int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, + const struct YuvConstants* param, int width); void 
I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, @@ -3739,11 +3743,11 @@ void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, int width); // ARGB preattenuated alpha blend. -void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3755,7 +3759,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3799,11 +3803,11 @@ void BlendPlaneRow_C(const uint8_t* src0, // ARGB multiply images. Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3811,7 +3815,7 @@ void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3819,7 +3823,7 @@ void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3845,11 +3849,11 @@ void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, int width); // ARGB add images. -void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3857,7 +3861,7 @@ void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3865,7 +3869,7 @@ void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3892,11 +3896,11 @@ void ARGBAddRow_Any_MMI(const uint8_t* y_buf, // ARGB subtract images. Same API as Blend, but these require // pointer and width alignment for SSE2. 
-void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3904,7 +3908,7 @@ void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -3912,7 +3916,7 @@ void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); @@ -4119,9 +4123,9 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y, - const uint8_t* src_vu, - uint8_t* dst_yuv24, +void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, @@ -4323,7 +4327,7 @@ void YUY2ToUV422Row_C(const uint8_t* src_yuy2, int width); void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4333,7 +4337,7 @@ void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4343,7 +4347,7 @@ void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4444,7 +4448,7 @@ void UYVYToUV422Row_C(const uint8_t* src_uyvy, int width); void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4454,7 +4458,7 @@ void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4464,7 +4468,7 @@ void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, int width); void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, - int src_stride_ptr, + int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); @@ -4501,29 +4505,29 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_C(const uint8_t* src_ayuv, - int stride_ayuv, + int 
src_stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_NEON(const uint8_t* src_ayuv, - int stride_ayuv, + int src_stride_ayuv, uint8_t* dst_vu, int width); -void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); -void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, - uint8_t* dst_uv, +void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void AYUVToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride, + uint8_t* dst_vu, int width); -void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, - int stride_ayuv, +void AYUVToVURow_Any_NEON(const uint8_t* src_ptr, + int src_stride, uint8_t* dst_vu, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 21c6bc4f0..0d2bdc6ab 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1784 +#define LIBYUV_VERSION 1785 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_common.cc b/source/compare_common.cc index d4b170ad9..d1cab8d2b 100644 --- a/source/compare_common.cc +++ b/source/compare_common.cc @@ -17,36 +17,6 @@ namespace libyuv { extern "C" { #endif -#if ORIGINAL_OPT -uint32_t HammingDistance_C1(const uint8_t* src_a, - const uint8_t* src_b, - int count) { - uint32_t diff = 0u; - - int i; - for (i = 0; i < count; ++i) { - int x = src_a[i] ^ src_b[i]; - if (x & 1) - ++diff; - if (x & 2) - ++diff; - if (x & 4) - ++diff; - if (x & 8) - ++diff; - if (x & 16) - ++diff; - if (x & 32) - ++diff; - if (x & 64) - ++diff; - if (x & 128) - ++diff; - } - return diff; -} -#endif - // Hakmem method for hamming distance. 
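For reference, a minimal sketch (not part of this patch) of the Hakmem-style popcount that HammingDistance_C applies per 32-bit word of src_a ^ src_b; the helper name is illustrative:

  static uint32_t popcount32(uint32_t x) {
    uint32_t u = x - ((x >> 1) & 0x55555555);        // 16 pairwise 2-bit sums
    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);  // 8 nibble sums
    return ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);  // byte total
  }

This is why the byte-at-a-time ORIGINAL_OPT variant above could be deleted: the surviving path counts 32 bits with branch-free arithmetic instead of eight tests per byte.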
uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, diff --git a/source/row_any.cc b/source/row_any.cc index 1aebb3c07..b7668a11f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -212,11 +212,23 @@ ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 4, 7) #endif #ifdef HAS_MERGEARGB16TO8ROW_AVX2 -ANY41PT(MergeARGB16To8Row_Any_AVX2, MergeARGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +ANY41PT(MergeARGB16To8Row_Any_AVX2, + MergeARGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) #endif #ifdef HAS_MERGEARGB16TO8ROW_NEON -ANY41PT(MergeARGB16To8Row_Any_NEON, MergeARGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7) +ANY41PT(MergeARGB16To8Row_Any_NEON, + MergeARGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) #endif #undef ANY41PT @@ -487,7 +499,13 @@ ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15) #ifdef HAS_MERGEXR30ROW_NEON ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3) -ANY31PT(MergeXR30Row_10_Any_NEON, MergeXR30Row_10_NEON, uint16_t, 2, uint8_t, 4, 3) +ANY31PT(MergeXR30Row_10_Any_NEON, + MergeXR30Row_10_NEON, + uint16_t, + 2, + uint8_t, + 4, + 3) #endif #ifdef HAS_MERGEXR64ROW_AVX2 @@ -499,11 +517,23 @@ ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 4, 7) #endif #ifdef HAS_MERGEXRGB16TO8ROW_AVX2 -ANY31PT(MergeXRGB16To8Row_Any_AVX2, MergeXRGB16To8Row_AVX2, uint16_t, 2, uint8_t, 4, 15) +ANY31PT(MergeXRGB16To8Row_Any_AVX2, + MergeXRGB16To8Row_AVX2, + uint16_t, + 2, + uint8_t, + 4, + 15) #endif #ifdef HAS_MERGEXRGB16TO8ROW_NEON -ANY31PT(MergeXRGB16To8Row_Any_NEON, MergeXRGB16To8Row_NEON, uint16_t, 2, uint8_t, 4, 7) +ANY31PT(MergeXRGB16To8Row_Any_NEON, + MergeXRGB16To8Row_NEON, + uint16_t, + 2, + uint8_t, + 4, + 7) #endif #undef ANY31PT @@ -1553,20 +1583,20 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ + int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 @@ -1844,17 +1874,17 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. 
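The MASK parameter below is the SIMD kernel's width granularity minus one: n = width & ~MASK pixels go straight through ANY_SIMD, and the r = width & MASK leftovers are copied into the aligned temp buffer and run through the kernel once more. For example, with MASK 15 (16-pixel kernels such as UYVYToUVRow_MMI) and width 100, n = 96 and r = 4.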
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ @@ -2001,17 +2031,17 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) // Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. // 128 byte row allows for 32 avx ARGB pixels. #define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu, \ int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 3]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + ANY_SIMD(src_ptr, src_stride, dst_vu, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ diff --git a/source/row_common.cc b/source/row_common.cc index 18b313a73..68446953d 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -553,80 +553,80 @@ static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* 
dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWY(NAME, R, G, B, BPP) \ - void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = src_rgb0[B] + src_rgb1[B]; \ - uint16_t ag = src_rgb0[G] + src_rgb1[G]; \ - uint16_t ar = src_rgb0[R] + src_rgb1[R]; \ - dst_u[0] = RGB2xToU(ar, ag, ab); \ - dst_v[0] = RGB2xToV(ar, ag, ab); \ - } \ +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = src_rgb[B] + src_rgb1[B]; \ + uint16_t ag = src_rgb[G] + src_rgb1[G]; \ + uint16_t ar = src_rgb[R] + src_rgb1[R]; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + 
dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ } #endif @@ -694,80 +694,80 @@ static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ + AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ + AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ + AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients -#define MAKEROWYJ(NAME, R, G, B, BPP) \ - void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ - } \ - void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ - uint8_t* dst_u, uint8_t* dst_v, int width) { \ - const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP] + 1) >> \ - 1; \ - uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP] + 1) >> \ - 1; \ - uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + 
BPP] + 1) >> \ - 1; \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \ - uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \ - uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \ - dst_u[0] = RGB2xToUJ(ar, ag, ab); \ - dst_v[0] = RGB2xToVJ(ar, ag, ab); \ - } \ +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ + src_rgb += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + src_rgb += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToUJ(ar, ag, ab); \ + dst_v[0] = RGB2xToVJ(ar, ag, ab); \ + } \ } #endif @@ -1237,16 +1237,16 @@ void ARGBShadeRow_C(const uint8_t* src_argb, #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8_t* src_argb0, +void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const uint32_t b = REPEAT8(src_argb0[0]); - const uint32_t g = REPEAT8(src_argb0[1]); - const uint32_t r = REPEAT8(src_argb0[2]); - const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); const uint32_t b_scale = src_argb1[0]; const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; @@ -1255,7 +1255,7 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1265,16 +1265,16 @@ void ARGBMultiplyRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8_t* src_argb0, +void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_add = src_argb1[0]; const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; @@ -1283,7 +1283,7 @@ void ARGBAddRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_add); dst_argb[2] = SHADE(r, r_add); dst_argb[3] = SHADE(a, a_add); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1292,16 +1292,16 
@@ void ARGBAddRow_C(const uint8_t* src_argb0, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8_t* src_argb0, +void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { - const int b = src_argb0[0]; - const int g = src_argb0[1]; - const int r = src_argb0[2]; - const int a = src_argb0[3]; + const int b = src_argb[0]; + const int g = src_argb[1]; + const int r = src_argb[2]; + const int a = src_argb[3]; const int b_sub = src_argb1[0]; const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; @@ -1310,7 +1310,7 @@ void ARGBSubtractRow_C(const uint8_t* src_argb0, dst_argb[1] = SHADE(g, g_sub); dst_argb[2] = SHADE(r, r_sub); dst_argb[3] = SHADE(a, a_sub); - src_argb0 += 4; + src_argb += 4; src_argb1 += 4; dst_argb += 4; } @@ -1486,7 +1486,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // KR = 0.299; KB = 0.114 // U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) +#if LIBYUV_UNLIMITED_DATA #define UB 129 /* round(2.018 * 64) */ #else #define UB 128 /* max(128, round(2.018 * 64)) */ @@ -1540,7 +1540,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR) // KR = 0.2126, KB = 0.0722 // U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) +#if LIBYUV_UNLIMITED_DATA #define UB 135 /* round(2.112 * 64) */ #else #define UB 128 /* max(128, round(2.112 * 64)) */ @@ -1594,7 +1594,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR) // KR = 0.2627; KB = 0.0593 // U and V contributions to R,G,B. -#if defined(LIBYUV_UNLIMITED_DATA) +#if LIBYUV_UNLIMITED_DATA #define UB 137 /* round(2.142 * 64) */ #else #define UB 128 /* max(128, round(2.142 * 64)) */ @@ -1646,7 +1646,7 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #undef MAKEYUVCONSTANTS -#if defined(LIBYUV_UNLIMITED_DATA) +#if LIBYUV_UNLIMITED_DATA // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. @@ -3347,19 +3347,19 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { #define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) -// Blend src_argb0 over src_argb1 and store to dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. 
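Because the foreground is preattenuated (each channel already multiplied by its alpha), BLEND only needs to scale the background: dst = f + ((256 - a) * b >> 8). A worked example with made-up channel values:

  // a = 192, premultiplied foreground f = 96, background b = 200:
  // (256 - 192) * 200 >> 8 = 50, so dst = clamp255(96 + 50) = 146
  uint8_t dst = BLEND(96, 200, 192);  // 146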
-void ARGBBlendRow_C(const uint8_t* src_argb0, +void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; @@ -3368,10 +3368,10 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[2] = BLEND(fr, br, a); dst_argb[3] = 255u; - fb = src_argb0[4 + 0]; - fg = src_argb0[4 + 1]; - fr = src_argb0[4 + 2]; - a = src_argb0[4 + 3]; + fb = src_argb[4 + 0]; + fg = src_argb[4 + 1]; + fr = src_argb[4 + 2]; + a = src_argb[4 + 3]; bb = src_argb1[4 + 0]; bg = src_argb1[4 + 1]; br = src_argb1[4 + 2]; @@ -3379,16 +3379,16 @@ void ARGBBlendRow_C(const uint8_t* src_argb0, dst_argb[4 + 1] = BLEND(fg, bg, a); dst_argb[4 + 2] = BLEND(fr, br, a); dst_argb[4 + 3] = 255u; - src_argb0 += 8; + src_argb += 8; src_argb1 += 8; dst_argb += 8; } if (width & 1) { - uint32_t fb = src_argb0[0]; - uint32_t fg = src_argb0[1]; - uint32_t fr = src_argb0[2]; - uint32_t a = src_argb0[3]; + uint32_t fb = src_argb[0]; + uint32_t fg = src_argb[1]; + uint32_t fr = src_argb[2]; + uint32_t a = src_argb[3]; uint32_t bb = src_argb1[0]; uint32_t bg = src_argb1[1]; uint32_t br = src_argb1[2]; diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 3b63fe2d2..a70325903 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1160,7 +1160,7 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, : "memory", "cc", "xmm0", "xmm1"); } -void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64, +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( @@ -1178,7 +1178,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ar64), // %0 + : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToABGR) // %3 @@ -1267,7 +1267,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, #endif #ifdef HAS_AB64TOARGBROW_AVX2 -void AB64ToARGBRow_AVX2(const uint16_t* src_ar64, +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( @@ -1286,7 +1286,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ar64, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_ar64), // %0 + : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToABGR) // %3 @@ -1506,7 +1506,7 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { #endif // HAS_RGBATOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1558,7 +1558,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1575,7 +1575,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1623,7 +1623,7 @@ void 
ARGBToUVRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1638,7 +1638,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ABGRTOUVROW_AVX2 -void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -1686,7 +1686,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_abgr0), // %0 + : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1701,7 +1701,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, #endif // HAS_ABGRTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1750,7 +1750,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1765,7 +1765,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1818,7 +1818,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -1905,7 +1905,7 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "xmm7"); } -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, @@ -1957,7 +1957,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_bgra0), // %0 + : "+r"(src_bgra), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -2002,7 +2002,7 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "xmm7"); } -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, @@ -2054,7 +2054,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_abgr0), // %0 + : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -2065,7 +2065,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, @@ -2117,7 +2117,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_rgba0), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 @@ -5741,7 +5741,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, #if defined(__i386__) : "m"(shift) // %5 #else - : "rm"(shift) // %5 + : "rm"(shift) // %5 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -5813,9 +5813,9 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, : "m"(shift), // %6 
"m"(mask), // %7 #else - "+rm"(width) // %5 - : "rm"(shift), // %6 - "rm"(mask), // %7 + "+rm"(width) // %5 + : "rm"(shift), // %6 + "rm"(mask), // %7 #endif "m"(MergeAR64Permute) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", @@ -5882,8 +5882,8 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, : "m"(shift), // %5 "m"(mask), // %6 #else - : "rm"(shift), // %5 - "rm"(mask), // %6 + : "rm"(shift), // %5 + "rm"(mask), // %6 #endif "m"(MergeAR64Permute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", @@ -5944,8 +5944,8 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, "+m"(width) // %5 : "m"(shift), // %6 #else - "+rm"(width) // %5 - : "rm"(shift), // %6 + "+rm"(width) // %5 + : "rm"(shift), // %6 #endif "m"(MergeARGB16To8Shuffle) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); @@ -6000,7 +6000,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #if defined(__i386__) : "m"(shift), // %5 #else - : "rm"(shift), // %5 + : "rm"(shift), // %5 #endif "m"(MergeARGB16To8Shuffle) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); @@ -6732,7 +6732,7 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -6803,7 +6803,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, "sub $0x1,%3 \n" "jge 91b \n" "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7405,7 +7405,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7433,7 +7433,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7444,7 +7444,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7471,7 +7471,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7482,7 +7482,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 
-void ARGBAddRow_SSE2(const uint8_t* src_argb0, +void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7499,7 +7499,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7510,7 +7510,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8_t* src_argb0, +void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7527,7 +7527,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7538,7 +7538,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7555,7 +7555,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -7566,7 +7566,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -7583,7 +7583,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 diff --git a/source/row_mmi.cc b/source/row_mmi.cc index 9a8e2cb2d..362fd1cfc 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -605,7 +605,7 @@ void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -613,8 +613,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -626,8 +626,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -639,8 +639,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], 
%[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -652,8 +652,8 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -671,20 +671,20 @@ void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ARGBToUVRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -700,9 +700,9 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -720,8 +720,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -748,8 +748,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -767,8 +767,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + 
"gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -795,8 +795,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -814,8 +814,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -842,8 +842,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -861,8 +861,8 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -901,7 +901,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -913,7 +913,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -921,7 +921,7 @@ void ARGBToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -929,8 +929,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh 
%[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -942,8 +942,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -955,8 +955,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -968,8 +968,8 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -987,20 +987,20 @@ void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1016,9 +1016,9 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1036,8 +1036,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1064,8 +1064,8 @@ 
void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1083,8 +1083,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1111,8 +1111,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1130,8 +1130,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1158,8 +1158,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1177,8 +1177,8 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1217,7 +1217,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1229,7 +1229,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] 
"f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1237,7 +1237,7 @@ void BGRAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1245,8 +1245,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1258,8 +1258,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1271,8 +1271,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1284,8 +1284,8 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1303,20 +1303,20 @@ void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1332,9 +1332,9 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + 
"daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1352,8 +1352,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1380,8 +1380,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1399,8 +1399,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1427,8 +1427,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1446,8 +1446,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1474,8 +1474,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1493,8 +1493,8 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1533,7 +1533,7 @@ void 
ABGRToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1545,7 +1545,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1553,7 +1553,7 @@ void ABGRToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1561,8 +1561,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1574,8 +1574,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x08(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x08(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1587,8 +1587,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x17(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x10(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x17(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x10(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1600,8 +1600,8 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x1f(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x18(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x1f(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x18(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1619,20 +1619,20 @@ void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x20 \n\t" + "daddiu %[src_argb], %[src_argb], 0x20 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] 
"=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1648,9 +1648,9 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1668,8 +1668,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1696,8 +1696,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1715,8 +1715,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1743,8 +1743,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1762,8 +1762,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1790,8 +1790,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - 
"gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1809,8 +1809,8 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1849,7 +1849,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -1861,7 +1861,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -1869,7 +1869,7 @@ void RGBAToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -1877,8 +1877,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1891,8 +1891,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1905,8 +1905,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1919,8 +1919,8 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - 
"gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -1939,20 +1939,20 @@ void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1968,9 +1968,9 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -1990,8 +1990,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2020,8 +2020,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2041,8 +2041,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2071,8 +2071,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 
0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2092,8 +2092,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2122,8 +2122,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2143,8 +2143,8 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2185,7 +2185,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2197,7 +2197,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2205,7 +2205,7 @@ void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; @@ -2213,8 +2213,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { __asm__ volatile( "1: \n\t" - "gsldlc1 %[src], 0x07(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x00(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2227,8 +2227,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" - "gsldlc1 %[src], 0x0d(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x06(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], 
%[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2241,8 +2241,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" - "gsldlc1 %[src], 0x13(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x0c(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2255,8 +2255,8 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" - "gsldlc1 %[src], 0x19(%[src_argb0]) \n\t" - "gsldrc1 %[src], 0x12(%[src_argb0]) \n\t" + "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" + "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" @@ -2275,20 +2275,20 @@ void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" - "daddiu %[src_argb0], %[src_argb0], 0x18 \n\t" + "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) - : [src_argb0] "r"(src_argb0), [dst_y] "r"(dst_y), [width] "r"(width), + : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2304,9 +2304,9 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2326,8 +2326,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x06(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0d(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2356,8 +2356,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x0c(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x13(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2377,8 +2377,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, 
"pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x12(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x19(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2407,8 +2407,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2428,8 +2428,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x1e(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x25(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2458,8 +2458,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x24(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2b(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2479,8 +2479,8 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x2a(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x31(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2521,7 +2521,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x30 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2533,7 +2533,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), @@ -2541,7 +2541,7 @@ void RAWToUVRow_MMI(const uint8_t* src_rgb0, : "memory"); } -void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest, dest0, dest1, dest2, dest3; uint64_t tmp0, tmp1; @@ -2618,13 +2618,13 @@ void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* 
dst_y, int width) { [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1) - : [src_ptr] "r"(src_argb0), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), + : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_y), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift), [value] "f"(value), [width] "r"(width) : "memory"); } -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2637,9 +2637,9 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, __asm__ volatile( "1: \n\t" - "daddu %[src_rgb1], %[src_rgb0], %[src_stride_rgb] \n\t" - "gsldrc1 %[src0], 0x00(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x07(%[src_rgb0]) \n\t" + "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" + "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2655,8 +2655,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x08(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x0f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2681,8 +2681,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x10(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x17(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2698,8 +2698,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x18(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x1f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2724,8 +2724,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" - "gsldrc1 %[src0], 0x20(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x27(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2741,8 +2741,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x28(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x2f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2767,8 +2767,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] 
\n\t" - "gsldrc1 %[src0], 0x30(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x37(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2784,8 +2784,8 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" - "gsldrc1 %[src0], 0x38(%[src_rgb0]) \n\t" - "gsldlc1 %[src0], 0x3f(%[src_rgb0]) \n\t" + "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" + "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" @@ -2822,7 +2822,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" - "daddiu %[src_rgb0], %[src_rgb0], 0x40 \n\t" + "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" @@ -2833,7 +2833,7 @@ void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) - : [src_rgb0] "r"(src_rgb0), [src_stride_rgb] "r"(src_stride_rgb), + : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), @@ -4386,7 +4386,7 @@ void ARGBShadeRow_MMI(const uint8_t* src_argb, : "memory"); } -void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, +void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4422,12 +4422,12 @@ void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } -void ARGBAddRow_MMI(const uint8_t* src_argb0, +void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4449,12 +4449,12 @@ void ARGBAddRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } -void ARGBSubtractRow_MMI(const uint8_t* src_argb0, +void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -4476,7 +4476,7 @@ void ARGBSubtractRow_MMI(const uint8_t* src_argb0, "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } @@ -5552,10 +5552,10 @@ void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { : "memory"); } -// Blend src_argb0 over src_argb1 and store to 
dst_argb. -// dst_argb may be src_argb0 or src_argb1. +// Blend src_argb over src_argb1 and store to dst_argb. +// dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_MMI(const uint8_t* src_argb0, +void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -5608,7 +5608,7 @@ void ARGBBlendRow_MMI(const uint8_t* src_argb0, [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) - : [src0_ptr] "r"(src_argb0), [src1_ptr] "r"(src_argb1), + : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) diff --git a/source/row_msa.cc b/source/row_msa.cc index fe6df93a6..c0b13b0fd 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -781,7 +781,7 @@ void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, } } -void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; @@ -792,10 +792,10 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); @@ -822,18 +822,18 @@ void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + const uint8_t* src_argb_next = src_argb + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; @@ -847,14 +847,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); + src2 = 
(v16u8)__msa_ld_b((v16u8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -875,14 +875,14 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, reg3 = __msa_hadd_u_h(vec5, vec5); reg4 = __msa_hadd_u_h(vec0, vec0); reg5 = __msa_hadd_u_h(vec1, vec1); - src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); - src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); - src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); - src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); - src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); - src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); - src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); - src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); @@ -945,8 +945,8 @@ void ARGBToUVRow_MSA(const uint8_t* src_argb0, dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); - src_argb0 += 128; - src_argb0_next += 128; + src_argb += 128; + src_argb_next += 128; dst_u += 16; dst_v += 16; } @@ -1173,7 +1173,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, } } -void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, +void ARGBMultiplyRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1184,7 +1184,7 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, v8i16 zero = {0}; for (x = 0; x < width; x += 4) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); @@ -1206,13 +1206,13 @@ void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); - src_argb0 += 16; + src_argb += 16; src_argb1 += 16; dst_argb += 16; } } -void ARGBAddRow_MSA(const uint8_t* src_argb0, +void ARGBAddRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1220,20 +1220,20 @@ void ARGBAddRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - 
src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } } -void ARGBSubtractRow_MSA(const uint8_t* src_argb0, +void ARGBSubtractRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -1241,14 +1241,14 @@ void ARGBSubtractRow_MSA(const uint8_t* src_argb0, v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } @@ -1794,7 +1794,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { } } -void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1809,9 +1809,9 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1830,12 +1830,12 @@ void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; @@ -1850,9 +1850,9 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v16i8 zero = {0}; for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); @@ -1871,7 +1871,7 @@ void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); - src_argb0 += 48; + src_argb += 48; dst_y += 16; } } @@ -2037,14 +2037,14 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, } } -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const 
uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; @@ -2147,14 +2147,14 @@ void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, } } -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; @@ -2446,7 +2446,7 @@ void SobelXYRow_MSA(const uint8_t* src_sobelx, } } -void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); @@ -2454,19 +2454,19 @@ void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x961D, const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); @@ -2474,19 +2474,19 @@ void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); @@ -2494,19 +2494,19 @@ void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); 
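// For reference (a sketch of the math, inferred from the constant values):
// each ARGBTOY constant packs two 8-bit coefficients into a 16-bit lane.
// With ABGR bytes ordered R,G,B,A, 0x8142 = (129 << 8) | 66 covers the G,R
// pair, 0x19 = 25 covers B, and 0x1080 = (16 << 8) | 128 folds the +16 luma
// offset into the rounding bias, so each pixel computes the BT.601
// studio-swing Y = (66*R + 129*G + 25*B + 0x1080) >> 8.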
ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); @@ -2514,26 +2514,26 @@ void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); - src2 = (v16u8)__msa_ld_b((void*)src_argb0, 32); - src3 = (v16u8)__msa_ld_b((void*)src_argb0, 48); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); - src_argb0 += 64; + src_argb += 64; dst_y += 16; } } -void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; v8u16 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3; v8u16 dst0, dst1, dst2, dst3; @@ -2658,14 +2658,14 @@ void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, } } -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2693,14 +2693,14 @@ void BGRAToUVRow_MSA(const uint8_t* src_rgb0, } } -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -2728,14 +2728,14 @@ void ABGRToUVRow_MSA(const uint8_t* src_rgb0, } } -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; - const uint8_t* s = src_rgb0; - const uint8_t* t = src_rgb0 + src_stride_rgb; + const uint8_t* s = src_rgb; + const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; @@ -3109,7 +3109,7 @@ void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, } } -void ARGBBlendRow_MSA(const uint8_t* src_argb0, +void ARGBBlendRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3123,8 +3123,8 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, v16i8 zero = {0}; for (x = 0; x < width; x += 8) { - src0 = (v16u8)__msa_ld_b((void*)src_argb0, 0); - src1 = (v16u8)__msa_ld_b((void*)src_argb0, 16); + src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = 
(v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); @@ -3168,7 +3168,7 @@ void ARGBBlendRow_MSA(const uint8_t* src_argb0, dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, dst_argb, 16); - src_argb0 += 32; + src_argb += 32; src_argb1 += 32; dst_argb += 32; } diff --git a/source/row_neon.cc b/source/row_neon.cc index ce0759f56..d4a992e74 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -415,11 +415,11 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_argb), // %2 @@ -438,11 +438,11 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_argb), // %2 @@ -537,11 +537,11 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -558,11 +558,11 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1680,7 +1680,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient @@ -1694,7 +1694,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -2655,7 +2655,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2706,7 +2706,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2944,7 +2944,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2964,7 +2964,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2973,7 +2973,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2987,7 +2987,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "vqadd.u8 q1, q1, q3 \n" // add R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2996,7 +2996,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3010,7 +3010,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "vqsub.u8 q1, q1, q3 \n" // subtract R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 517d38aea..350c9646a 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -909,7 +909,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, "+r"(src_b), // %2 "+r"(dst_ar30), // %3 "+r"(width) // %4 - : "r"(shift) // %5 + : "r"(shift) // %5 : "memory", "cc", "v0", "v1", "v2", "v30", "v31"); } @@ -1305,10 +1305,10 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "orr v4.8b, v0.8b, v0.8b \n" // move r + "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1324,10 +1324,10 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v2.8b, v4.8b, v4.8b \n" // move g + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v2.8b, v4.8b, v4.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" - "orr v1.8b, v5.8b, v5.8b \n" // move r + "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1377,8 +1377,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - RGB565TOARGB + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb565), // %0 @@ -1467,8 +1466,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - ARGB4444TOARGB + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb4444), // %0 @@ -1485,7 +1483,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24 @@ -1502,8 +1500,8 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g "prfm pldl1keep, [%0, 448] \n" "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b @@ -1676,7 +1674,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "subs %w4, %w4, #16 \n" // 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us @@ -1724,8 +1722,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - ARGBTORGB565 + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1766,8 +1763,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "prfm pldl1keep, [%0, 448] \n" - ARGBTOARGB1555 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1787,8 +1783,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. - "prfm pldl1keep, [%0, 448] \n" - ARGBTOARGB4444 + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1956,7 +1951,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } -void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { +void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #29 \n" // B * 0.1140 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient @@ -1971,7 +1966,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" - : "+r"(src_argb), // %0 + : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : @@ -2668,8 +2663,8 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R @@ -2692,8 +2687,8 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R @@ -2715,8 +2710,8 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R @@ -2737,8 +2732,8 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // B + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R @@ -2818,7 +2813,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8_t* src_argb0, +void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -2880,7 +2875,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "99: \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -2900,11 +2895,11 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a "prfm pldl1keep, [%0, 448] \n" - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -2930,8 +2925,8 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "prfm pldl1keep, [%0, 448] \n" "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" @@ -3040,8 +3035,8 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v30.8b, #50 \n" // BR coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R @@ -3127,7 +3122,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, +void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3149,7 +3144,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -3158,7 +3153,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, } // Add 2 rows of ARGB pixels together, 8 pixels at a time. 
-void ARGBAddRow_NEON(const uint8_t* src_argb0, +void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3176,7 +3171,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "uqadd v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -3185,7 +3180,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8_t* src_argb0, +void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { @@ -3203,7 +3198,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "uqsub v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" - : "+r"(src_argb0), // %0 + : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 @@ -3703,9 +3698,9 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values "prfm pldl1keep, [%0, 448] \n" - "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels diff --git a/source/row_win.cc b/source/row_win.cc index 951518926..78256f8ec 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1427,7 +1427,7 @@ __declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, } } -__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1573,7 +1573,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1641,7 +1641,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1709,7 +1709,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { @@ -1767,7 +1767,7 @@ __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1839,7 
+1839,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -1911,7 +1911,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, } } -__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -4347,13 +4347,13 @@ static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4442,7 +4442,7 @@ __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm3, xmm3 // generate mask 0xff000000 @@ -4487,7 +4487,7 @@ __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4581,7 +4581,7 @@ __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] // src_argb0 + mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax @@ -4937,20 +4937,20 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 @@ -4958,8 +4958,8 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, punpckhbw xmm1, xmm1 // next 2 punpcklbw xmm2, xmm5 // first 2 punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4977,13 +4977,13 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. 
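// (paddusb below is SSE2's unsigned saturating byte add: per channel,
// min(a + b, 255), matching the NEON uqadd path above.)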
-__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4992,11 +4992,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5007,11 +5007,11 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixels from src_argb lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -5026,23 +5026,23 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5056,20 +5056,20 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
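// The unpack-then-pmulhuw sequence in these multiply rows widens each byte a
// to a * 257 (punpcklbw with itself) and keeps the high 16 bits of the
// product, approximating (a * b) / 255 to within 1 (e.g. 255 * 255 -> 254).
// A minimal scalar sketch of the same math (name is illustrative only):
static inline uint8_t MulDiv255Approx(uint8_t a, uint8_t b) {
  return (uint8_t)((a * 257 * b) >> 16);  // == pmulhuw((a << 8) | a, b)
}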
-__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5077,8 +5077,8 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5094,19 +5094,19 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] @@ -5124,21 +5124,21 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 + mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] diff --git a/source/scale.cc b/source/scale.cc index a254737c0..03b0486f7 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1446,7 +1446,8 @@ void ScalePlaneUp2_Bilinear(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. 
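// (Each Scale2RowUp call filters a pair of adjacent source rows into two
// destination rows, hence the 2 * dst_stride advance below.)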
dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 7b977912f..d9a314453 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -746,7 +746,8 @@ void ScaleUVBilinearUp2(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { @@ -851,7 +852,8 @@ void ScaleUVBilinearUp2_16(int src_width, for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; - // TODO: Test performance of writing one row of destination at a time. + // TODO(fbarchard): Test performance of writing one row of destination at a + // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 3449c864f..a5cdfd712 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2404,8 +2404,7 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(dst_pixels_opt, kPixels); align_buffer_page_end(dst_pixels_c, kPixels); @@ -2433,8 +2432,7 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); @@ -2567,35 +2565,25 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; - align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels); + align_buffer_page_end(src_pixels_v, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); - MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); + MemRandomize(src_pixels_u, kPixels); + MemRandomize(src_pixels_v, kPixels); MemRandomize(dst_pixels_opt, kPixels * 2); MemRandomize(dst_pixels_c, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - 
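// (Test pattern used throughout: run once with SIMD masked off to produce
// the C reference, then time the optimized path and compare the outputs
// byte for byte.)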
for (int i = 0; i < benchmark_iterations_; ++i) { - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, + MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } @@ -2604,119 +2592,88 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } // 16 bit channel split and merge TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; - align_buffer_page_end(src_pixels, kPixels * 2 * 2); - align_buffer_page_end(tmp_pixels_u_c, kPixels * 2); - align_buffer_page_end(tmp_pixels_v_c, kPixels * 2); - align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2); - align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2); + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_u, kPixels * 2); + align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2); - MemRandomize(src_pixels, kPixels * 2 * 2); - MemRandomize(tmp_pixels_u_c, kPixels * 2); - MemRandomize(tmp_pixels_v_c, kPixels * 2); - MemRandomize(tmp_pixels_u_opt, kPixels * 2); - MemRandomize(tmp_pixels_v_opt, kPixels * 2); + MemRandomize(src_pixels_u, kPixels * 2); + MemRandomize(src_pixels_v, kPixels * 2); MemRandomize(dst_pixels_opt, kPixels * 2 * 2); MemRandomize(dst_pixels_c, kPixels * 2 * 2); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, - (uint16_t*)tmp_pixels_u_c, benchmark_width_, - (uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_, - benchmark_height_, 12); - MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_, - (const uint16_t*)tmp_pixels_v_c, benchmark_width_, + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, (uint16_t*)dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_, 12); MaskCpuFlags(benchmark_cpu_info_); - SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, - (uint16_t*)tmp_pixels_u_opt, benchmark_width_, - (uint16_t*)tmp_pixels_v_opt, benchmark_width_, - benchmark_width_, benchmark_height_, 12); - for (int i = 0; i < benchmark_iterations_; ++i) { - MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_, - (const uint16_t*)tmp_pixels_v_opt, benchmark_width_, + MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, + (const uint16_t*)src_pixels_v, benchmark_width_, (uint16_t*)dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_, 12); } - for (int i = 0; i < kPixels * 2; ++i) { - EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]); - EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]); - } for (int i = 0; i < kPixels * 2 * 2; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } - free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u_c); - free_aligned_buffer_page_end(tmp_pixels_v_c); - free_aligned_buffer_page_end(tmp_pixels_u_opt); - 
free_aligned_buffer_page_end(tmp_pixels_v_opt); + free_aligned_buffer_page_end(src_pixels_u); + free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); - align_buffer_page_end(tmp_pixels_u, kPixels); - align_buffer_page_end(tmp_pixels_v, kPixels); - align_buffer_page_end(dst_pixels_opt, kPixels * 2); - align_buffer_page_end(dst_pixels_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels); + align_buffer_page_end(dst_pixels_v_c, kPixels); + align_buffer_page_end(dst_pixels_u_opt, kPixels); + align_buffer_page_end(dst_pixels_v_opt, kPixels); MemRandomize(src_pixels, kPixels * 2); - MemRandomize(tmp_pixels_u, kPixels); - MemRandomize(tmp_pixels_v, kPixels); - MemRandomize(dst_pixels_opt, kPixels * 2); - MemRandomize(dst_pixels_c, kPixels * 2); + MemRandomize(dst_pixels_u_c, kPixels); + MemRandomize(dst_pixels_v_c, kPixels); + MemRandomize(dst_pixels_u_opt, kPixels); + MemRandomize(dst_pixels_v_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, benchmark_width_, - tmp_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_); - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_c, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); + SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c, + benchmark_width_, dst_pixels_v_c, benchmark_width_, + benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { - SplitUVPlane(src_pixels, benchmark_width_ * 2, tmp_pixels_u, - benchmark_width_, tmp_pixels_v, benchmark_width_, + SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt, + benchmark_width_, dst_pixels_v_opt, benchmark_width_, benchmark_width_, benchmark_height_); } - MergeUVPlane(tmp_pixels_u, benchmark_width_, tmp_pixels_v, benchmark_width_, - dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, - benchmark_height_); - for (int i = 0; i < kPixels * 2; ++i) { - EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); } free_aligned_buffer_page_end(src_pixels); - free_aligned_buffer_page_end(tmp_pixels_u); - free_aligned_buffer_page_end(tmp_pixels_v); - free_aligned_buffer_page_end(dst_pixels_opt); - free_aligned_buffer_page_end(dst_pixels_c); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); } // 16 bit channel split TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2 * 2); align_buffer_page_end(dst_pixels_u_c, kPixels * 2); align_buffer_page_end(dst_pixels_v_c, kPixels * 2); @@ -2755,7 +2712,7 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ 
* benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); @@ -2785,7 +2742,7 @@ TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2834,7 +2791,7 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2881,8 +2838,7 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2936,8 +2892,7 @@ TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2991,8 +2946,7 @@ TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -3042,8 +2996,7 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -3091,30 +3044,29 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// Merge 4 channels #define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \
-    const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
+    const int kPixels = kWidth * benchmark_height_; \
     align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
     align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+    MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \
+    memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+    memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
     STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
     STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
     STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
     STYPE* src_pixels_a = reinterpret_cast<STYPE*>(src_memory_a + OFF); \
     DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
     DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
-    for (int i = 0; i < kPixels; ++i) { \
-      src_pixels_r[i] = fastrand() & 65535; \
-      src_pixels_g[i] = fastrand() & 65535; \
-      src_pixels_b[i] = fastrand() & 65535; \
-      src_pixels_a[i] = fastrand() & 65535; \
-    } \
-    memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
-    memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
     MaskCpuFlags(disable_cpu_flags_); \
     FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
                 kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \
@@ -3136,27 +3088,26 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
     free_aligned_buffer_page_end(dst_memory_opt); \
   }

+// Merge 3 channel RGB into 4 channel XRGB with opaque alpha
 #define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \
   TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \
     const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kPixels = (kWidth * benchmark_height_ + 15) & ~15; \
+    const int kPixels = kWidth * benchmark_height_; \
     align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
     align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+    MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
+    memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \
+    memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \
     STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
     STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
     STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
     DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
     DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
-    for (int i = 0; i < kPixels; ++i) { \
-      src_pixels_r[i] = fastrand() & 65535; \
-      src_pixels_g[i] = fastrand() & 65535; \
-      src_pixels_b[i] = fastrand() & 65535; \
-    } \
-    memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
-    memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
     MaskCpuFlags(disable_cpu_flags_); \
     FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \
                 kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \
@@ -3177,6 +3128,7 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
     free_aligned_buffer_page_end(dst_memory_opt); \
   }

+// TODO(fbarchard): fix bug and change to benchmark_width - 1
 #define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
   TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
   TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
@@ -3206,16 +3158,14 @@ TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
     align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
     align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \
     align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \
+    MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \
+    MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \
     STYPE* src_pixels_r = reinterpret_cast<STYPE*>(src_memory_r + OFF); \
     STYPE* src_pixels_g = reinterpret_cast<STYPE*>(src_memory_g + OFF); \
     STYPE* src_pixels_b = reinterpret_cast<STYPE*>(src_memory_b + OFF); \
     DTYPE* dst_pixels_c = reinterpret_cast<DTYPE*>(dst_memory_c); \
     DTYPE* dst_pixels_opt = reinterpret_cast<DTYPE*>(dst_memory_opt); \
-    for (int i = 0; i < kPixels; ++i) { \
-      src_pixels_r[i] = fastrand() & 65535; \
-      src_pixels_g[i] = fastrand() & 65535; \
-      src_pixels_b[i] = fastrand() & 65535; \
-    } \
     memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \
     memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \
     MaskCpuFlags(disable_cpu_flags_); \
@@ -3238,13 +3188,13 @@ TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16)
     free_aligned_buffer_page_end(dst_memory_opt); \
   }

+// TODO(fbarchard): Fix MergeXR30 and change _any to width - 1
 #define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \
   TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ - 4, _Any, +, 0) \
   TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \
                   1) \
   TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \
   TESTTPLANARTOPI(FUNC, STYPE, DTYPE,
DEPTH, benchmark_width_, _Opt, +, 0) - TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10) TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12) TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) @@ -3254,6 +3204,7 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); @@ -3299,6 +3250,7 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); @@ -3334,8 +3286,7 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { #endif // HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); @@ -3414,8 +3365,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { #endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, Convert8To16Plane) { - // Round count up to multiple of 16 - const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2);