diff --git a/README.chromium b/README.chromium index dbc7f68b9..f97dcea59 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1945 +Version: 1946 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index f38c9fb5a..5aced2a2a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -36,25 +36,16 @@ extern "C" { // The following are available on clang x86 platforms: #if defined(USE_ROW_GCC) // Conversions: -#define HAS_ARGB1555TOARGBROW_SSE2 -#define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 #define HAS_ARGBSHUFFLEROW_SSSE3 -#define HAS_ARGBTOARGB1555ROW_SSE2 -#define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 -#define HAS_ARGBTORGB565DITHERROW_SSE2 -#define HAS_ARGBTORGB565ROW_SSE2 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 @@ -63,13 +54,11 @@ extern "C" { #define HAS_MIRRORROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB24ROW_SSSE3 -#define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 -#define HAS_RGB565TOARGBROW_SSE2 #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 @@ -162,11 +151,8 @@ extern "C" { #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_COPYROW_AVX #define HAS_HALFFLOATROW_AVX2 -#define HAS_I422TOARGB1555ROW_AVX2 -#define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 
#define HAS_I422TORGB24ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 #define HAS_I444TORGB24ROW_AVX2 @@ -175,7 +161,6 @@ extern "C" { #define HAS_MIRRORSPLITUVROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB24ROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 @@ -236,6 +221,10 @@ extern "C" { #define HAS_P410TOAR30ROW_SSSE3 #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_AVX2 +#define HAS_RGB24TOARGBROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITARGBROW_SSE2 @@ -359,6 +348,10 @@ extern "C" { ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) #define HAS_RAWTOARGBROW_AVX2 +#define HAS_RGB24TOARGBROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 #if defined(__x86_64__) || defined(_M_X64) #define HAS_RAWTOARGBROW_AVX512BW #define HAS_RGB24TOARGBROW_AVX512BW @@ -398,6 +391,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \ (defined(CLANG_HAS_AVX512)) #define HAS_I422TOARGBROW_AVX512BW +#define HAS_ARGBSHUFFLEROW_AVX512BW #define HAS_ARGBTOUV444ROW_AVX512BW #define HAS_ARGBTOUV444MATRIXROW_AVX512BW #define HAS_ARGBTOYROW_AVX512BW @@ -454,6 +448,7 @@ extern "C" { #define HAS_RGB565TOYMATRIXROW_NEON #define HAS_ARGB1555TOYMATRIXROW_NEON #define HAS_ARGB4444TOYMATRIXROW_NEON +#define HAS_RGBTOYMATRIXROW_NEON #endif #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOVUROW_NEON @@ -2216,6 +2211,7 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, int width, const struct ArgbConstants* c); void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void 
RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2237,19 +2233,86 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void RGB565ToYMatrixRow_Any_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_Any_AVX2(const uint8_t* src_argb1555, + 
uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_Any_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_Any_AVX2(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGB565ToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const 
struct ArgbConstants* c); void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, @@ -2278,6 +2341,9 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, const struct ArgbConstants* c); void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c); +void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); + void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_y, int width, @@ -4048,6 +4114,10 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); +void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); void ARGBShuffleRow_NEON(const uint8_t* 
src_argb, uint8_t* dst_argb, const uint8_t* shuffler, @@ -4068,6 +4138,10 @@ void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); +void ARGBShuffleRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, @@ -4086,12 +4160,11 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); + void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); @@ -4177,10 +4250,21 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_AVX512BW(const uint8_t* 
src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4188,15 +4272,7 @@ void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); + void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4255,9 +4331,7 @@ void ARGB4444ToARGBRow_Any_LASX(const uint8_t* src_ptr, void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); + void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); @@ -4270,10 +4344,7 @@ void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width); + void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, @@ -4843,11 +4914,7 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); + void 
NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4866,11 +4933,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width); -void NV12ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_uv, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); + void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, @@ -4945,42 +5008,7 @@ void I422ToRGBARow_SSSE3(const uint8_t* y_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb4444, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_argb1555, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_SSSE3(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_AVX2(const uint8_t* src_y, - const uint8_t* src_u, - const uint8_t* src_v, - uint8_t* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width); + void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5235,16 +5263,7 @@ void NV21ToYUV24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - 
const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* uv_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); + void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, @@ -5307,42 +5326,7 @@ void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, - const uint8_t* u_buf, - const uint8_t* v_buf, - uint8_t* dst_ptr, - const struct YuvConstants* yuvconstants, - int width); + void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -5609,15 +5593,7 @@ void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); -void 
ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); + void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -5631,10 +5607,7 @@ void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - const uint32_t param, - int width); + void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 955f568e6..d90f894f7 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1945 +#define LIBYUV_VERSION 1946 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 84d2f75db..9ad364e11 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2136,10 +2136,18 @@ ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -2522,22 +2530,34 @@ int RGB24ToI420(const uint8_t* src_rgb24, #if defined(HAS_RGBTOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBToYMatrixRow = RGBToYMatrixRow_AVX2; + RGBToYMatrixRow = RGBToYMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBToYMatrixRow = RGBToYMatrixRow_AVX2; + } } #endif #if defined(HAS_RGBTOUVMATRIXROW_AVX2) if 
(TestCpuFlag(kCpuHasAVX2)) { - RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2; + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2; + } } #endif #if defined(HAS_RGBTOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBToUVMatrixRow = RGBToUVMatrixRow_NEON; + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_NEON; + } } #endif #if defined(HAS_RGBTOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBToYMatrixRow = RGBToYMatrixRow_NEON; + RGBToYMatrixRow = RGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBToYMatrixRow = RGBToYMatrixRow_NEON; + } } #endif #if defined(HAS_RGBTOYMATRIXROW_LSX) @@ -2825,22 +2845,34 @@ int RAWToI420(const uint8_t* src_rgb24, #if defined(HAS_RGBTOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBToYMatrixRow = RGBToYMatrixRow_AVX2; + RGBToYMatrixRow = RGBToYMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBToYMatrixRow = RGBToYMatrixRow_AVX2; + } } #endif #if defined(HAS_RGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2; + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_AVX2; + } } #endif #if defined(HAS_RGBTOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBToUVMatrixRow = RGBToUVMatrixRow_NEON; + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_NEON; + } } #endif #if defined(HAS_RGBTOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGBToYMatrixRow = RGBToYMatrixRow_NEON; + RGBToYMatrixRow = RGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGBToYMatrixRow = RGBToYMatrixRow_NEON; + } } #endif #if defined(HAS_RGBTOYMATRIXROW_LSX) @@ -3565,22 +3597,34 @@ int RGB565ToI420(const uint8_t* src_rgb565, #if defined(HAS_RGB565TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToYMatrixRow = 
RGB565ToYMatrixRow_AVX2; + RGB565ToYMatrixRow = RGB565ToYMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB565ToYMatrixRow = RGB565ToYMatrixRow_AVX2; + } } #endif #if defined(HAS_RGB565TOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_AVX2; + RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_AVX2; + } } #endif #if defined(HAS_RGB565TOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_NEON; + RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVMatrixRow = RGB565ToUVMatrixRow_NEON; + } } #endif #if defined(HAS_RGB565TOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - RGB565ToYMatrixRow = RGB565ToYMatrixRow_NEON; + RGB565ToYMatrixRow = RGB565ToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToYMatrixRow = RGB565ToYMatrixRow_NEON; + } } #endif @@ -3622,30 +3666,44 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, int width, int height) { int y; - void (*ARGB1555ToUVMatrixRow)(const uint8_t* src_argb1555, int src_stride_argb1555, - uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGB1555ToUVMatrixRow_C; - void (*ARGB1555ToYMatrixRow)(const uint8_t* src_argb1555, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGB1555ToYMatrixRow_C; + void (*ARGB1555ToUVMatrixRow)( + const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, + uint8_t* dst_v, int width, + const struct ArgbConstants* c) = ARGB1555ToUVMatrixRow_C; + void (*ARGB1555ToYMatrixRow)( + const uint8_t* src_argb1555, uint8_t* dst_y, int width, + const struct ArgbConstants* c) = ARGB1555ToYMatrixRow_C; #if defined(HAS_ARGB1555TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_AVX2; + ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGB1555ToYMatrixRow 
= ARGB1555ToYMatrixRow_AVX2; + } } #endif #if defined(HAS_ARGB1555TOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_AVX2; + ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_AVX2; + } } #endif #if defined(HAS_ARGB1555TOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_NEON; + ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVMatrixRow = ARGB1555ToUVMatrixRow_NEON; + } } #endif #if defined(HAS_ARGB1555TOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_NEON; + ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYMatrixRow = ARGB1555ToYMatrixRow_NEON; + } } #endif @@ -3687,30 +3745,44 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, int width, int height) { int y; - void (*ARGB4444ToUVMatrixRow)(const uint8_t* src_argb4444, int src_stride_argb4444, - uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGB4444ToUVMatrixRow_C; - void (*ARGB4444ToYMatrixRow)(const uint8_t* src_argb4444, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGB4444ToYMatrixRow_C; + void (*ARGB4444ToUVMatrixRow)( + const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, + uint8_t* dst_v, int width, + const struct ArgbConstants* c) = ARGB4444ToUVMatrixRow_C; + void (*ARGB4444ToYMatrixRow)( + const uint8_t* src_argb4444, uint8_t* dst_y, int width, + const struct ArgbConstants* c) = ARGB4444ToYMatrixRow_C; #if defined(HAS_ARGB4444TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_AVX2; + ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_AVX2; + } } #endif #if defined(HAS_ARGB4444TOUVMATRIXROW_AVX2) 
if (TestCpuFlag(kCpuHasAVX2)) { - ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_AVX2; + ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_AVX2; + } } #endif #if defined(HAS_ARGB4444TOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_NEON; + ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVMatrixRow = ARGB4444ToUVMatrixRow_NEON; + } } #endif #if defined(HAS_ARGB4444TOYMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_NEON; + ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToYMatrixRow = ARGB4444ToYMatrixRow_NEON; + } } #endif diff --git a/source/convert_argb.cc b/source/convert_argb.cc index f7d9e9194..4f5059ac2 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3889,14 +3889,7 @@ int RGB565ToARGB(const uint8_t* src_rgb565, height = 1; src_stride_rgb565 = dst_stride_argb = 0; } -#if defined(HAS_RGB565TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - RGB565ToARGBRow = RGB565ToARGBRow_SSE2; - } - } -#endif + #if defined(HAS_RGB565TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; @@ -3965,14 +3958,7 @@ int ARGB1555ToARGB(const uint8_t* src_argb1555, height = 1; src_stride_argb1555 = dst_stride_argb = 0; } -#if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; - } - } -#endif + #if defined(HAS_ARGB1555TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; @@ -4046,14 +4032,7 @@ int ARGB4444ToARGB(const uint8_t* src_argb4444, height = 1; src_stride_argb4444 = dst_stride_argb = 0; } 
-#if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; - } - } -#endif + #if defined(HAS_ARGB4444TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 612821996..08a0cd825 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -315,10 +315,18 @@ ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -477,7 +485,7 @@ ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } @@ -692,7 +700,7 @@ ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } @@ -1411,14 +1419,7 @@ int ARGBToRGB565Dither(const uint8_t* src_argb, if (!dither4x4) { dither4x4 = kDither565_4x4; } -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565DitherRow = 
ARGBToRGB565DitherRow_SSE2; - } - } -#endif + #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; @@ -1494,14 +1495,7 @@ int ARGBToRGB565(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_rgb565 = 0; } -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; - } - } -#endif + #if defined(HAS_ARGBTORGB565ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; @@ -1575,14 +1569,7 @@ int ARGBToARGB1555(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb1555 = 0; } -#if defined(HAS_ARGBTOARGB1555ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; - } - } -#endif + #if defined(HAS_ARGBTOARGB1555ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; @@ -1650,14 +1637,7 @@ int ARGBToARGB4444(const uint8_t* src_argb, height = 1; src_stride_argb = dst_stride_argb4444 = 0; } -#if defined(HAS_ARGBTOARGB4444ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; - } - } -#endif + #if defined(HAS_ARGBTOARGB4444ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToARGB4444Row = ARGBToARGB4444Row_Any_AVX2; @@ -2345,7 +2325,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw, #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX2; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } @@ -2353,7 +2333,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw, #if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) if (TestCpuFlag(kCpuHasAVX512BW)) { 
ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_AVX512BW; - if (IS_ALIGNED(width, 32)) { + if (IS_ALIGNED(width, 64)) { ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX512BW; } } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 02757f3b0..5bd5fd911 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -4601,6 +4601,14 @@ int ARGBShuffle(const uint8_t* src_argb, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + ARGBShuffleRow = ARGBShuffleRow_Any_AVX512BW; + if (IS_ALIGNED(width, 32)) { + ARGBShuffleRow = ARGBShuffleRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBShuffleRow = ARGBShuffleRow_Any_NEON; @@ -4679,6 +4687,14 @@ int AR64Shuffle(const uint16_t* src_ar64, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX512BW; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX512BW; + } + } +#endif #if defined(HAS_ARGBSHUFFLEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { AR64ShuffleRow = ARGBShuffleRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 70b83e4e5..cac6339d1 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -945,9 +945,7 @@ ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #if defined(HAS_ARGBTORGB24ROW_SSSE3) ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) -ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) -ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) + #endif #if defined(HAS_ARGBTORGB24ROW_AVX2) ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) @@ -993,13 +991,14 @@ ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, 
RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) -ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) -ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) + #endif #if defined(HAS_RAWTOARGBROW_AVX2) ANY11(RAWToARGBRow_Any_AVX2, RAWToARGBRow_AVX2, 0, 3, 4, 31) #endif +#if defined(HAS_RGB24TOARGBROW_AVX2) +ANY11(RGB24ToARGBRow_Any_AVX2, RGB24ToARGBRow_AVX2, 0, 3, 4, 31) +#endif #if defined(HAS_RAWTOARGBROW_AVX512BW) ANY11(RAWToARGBRow_Any_AVX512BW, RAWToARGBRow_AVX512BW, 0, 3, 4, 63) #endif @@ -1415,8 +1414,8 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ - SIMD_ALIGNED(uint8_t vin[64]); \ - SIMD_ALIGNED(uint8_t vout[64]); \ + SIMD_ALIGNED(uint8_t vin[(MASK + 1) * SBPP]); \ + SIMD_ALIGNED(uint8_t vout[(MASK + 1) * BPP]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -1462,14 +1461,6 @@ ANY11P(I400ToARGBRow_Any_LSX, 15) #endif -#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, - ARGBToRGB565DitherRow_SSE2, - const uint32_t, - 4, - 2, - 3) -#endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, @@ -1508,6 +1499,14 @@ ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #ifdef HAS_ARGBSHUFFLEROW_AVX2 ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif +#ifdef HAS_ARGBSHUFFLEROW_AVX512BW +ANY11P(ARGBShuffleRow_Any_AVX512BW, + ARGBShuffleRow_AVX512BW, + const uint8_t*, + 4, + 4, + 31) +#endif #ifdef HAS_ARGBSHUFFLEROW_NEON ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #endif @@ -2266,8 
+2265,28 @@ ANY12MS(ARGBToUVMatrixRow_Any_NEON, ARGBToUVMatrixRow_NEON, 0, 4, 15) #ifdef HAS_ARGBTOUVMATRIXROW_NEON_I8MM ANY12MS(ARGBToUVMatrixRow_Any_NEON_I8MM, ARGBToUVMatrixRow_NEON_I8MM, 0, 4, 15) #endif +#ifdef HAS_RGBTOUVMATRIXROW_NEON +ANY12MS(RGBToUVMatrixRow_Any_NEON, RGBToUVMatrixRow_NEON, 0, 3, 15) +#endif +#ifdef HAS_RGB565TOUVMATRIXROW_NEON +ANY12MS(RGB565ToUVMatrixRow_Any_NEON, RGB565ToUVMatrixRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB1555TOUVMATRIXROW_NEON +ANY12MS(ARGB1555ToUVMatrixRow_Any_NEON, ARGB1555ToUVMatrixRow_NEON, 0, 2, 15) +#endif +#ifdef HAS_ARGB4444TOUVMATRIXROW_NEON +ANY12MS(ARGB4444ToUVMatrixRow_Any_NEON, ARGB4444ToUVMatrixRow_NEON, 0, 2, 15) +#endif #ifdef HAS_ARGBTOUVMATRIXROW_AVX2 -ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 15) +ANY12MS(ARGBToUVMatrixRow_Any_AVX2, ARGBToUVMatrixRow_AVX2, 0, 4, 31) +ANY12MS(RGBToUVMatrixRow_Any_AVX2, RGBToUVMatrixRow_AVX2, 0, 3, 31) +ANY12MS(RGB565ToUVMatrixRow_Any_AVX2, RGB565ToUVMatrixRow_AVX2, 0, 2, 31) +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +ANY12MS(ARGB1555ToUVMatrixRow_Any_AVX2, ARGB1555ToUVMatrixRow_AVX2, 0, 2, 31) +#endif +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31) +#endif #endif #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63) @@ -2309,6 +2328,14 @@ ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15) #endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31) +ANY11MC(RGBToYMatrixRow_Any_AVX2, RGBToYMatrixRow_AVX2, 3, 31) +ANY11MC(RGB565ToYMatrixRow_Any_AVX2, RGB565ToYMatrixRow_AVX2, 2, 31) +#ifdef HAS_ARGB1555TOYMATRIXROW_AVX2 +ANY11MC(ARGB1555ToYMatrixRow_Any_AVX2, ARGB1555ToYMatrixRow_AVX2, 2, 31) +#endif +#ifdef HAS_ARGB4444TOYMATRIXROW_AVX2 +ANY11MC(ARGB4444ToYMatrixRow_Any_AVX2, ARGB4444ToYMatrixRow_AVX2, 2, 31) +#endif #endif #ifdef HAS_ARGBTOYROW_AVX512BW 
ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63) @@ -2319,6 +2346,18 @@ ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) #ifdef HAS_ARGBTOYMATRIXROW_NEON_DOTPROD ANY11MC(ARGBToYMatrixRow_Any_NEON_DotProd, ARGBToYMatrixRow_NEON_DotProd, 4, 15) #endif +#ifdef HAS_RGBTOYMATRIXROW_NEON +ANY11MC(RGBToYMatrixRow_Any_NEON, RGBToYMatrixRow_NEON, 3, 15) +#endif +#ifdef HAS_RGB565TOYMATRIXROW_NEON +ANY11MC(RGB565ToYMatrixRow_Any_NEON, RGB565ToYMatrixRow_NEON, 2, 15) +#endif +#ifdef HAS_ARGB1555TOYMATRIXROW_NEON +ANY11MC(ARGB1555ToYMatrixRow_Any_NEON, ARGB1555ToYMatrixRow_NEON, 2, 15) +#endif +#ifdef HAS_ARGB4444TOYMATRIXROW_NEON +ANY11MC(ARGB4444ToYMatrixRow_Any_NEON, ARGB4444ToYMatrixRow_NEON, 2, 15) +#endif #ifdef HAS_ARGBTOYMATRIXROW_LSX ANY11MC(ARGBToYMatrixRow_Any_LSX, ARGBToYMatrixRow_LSX, 4, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index 6b60aac12..a18c90d12 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4170,7 +4170,7 @@ void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB24ROW_AVX2) +#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) void NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, @@ -4181,11 +4181,7 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif src_y += twidth; src_uv += twidth; dst_rgb24 += twidth * 3; @@ -4194,7 +4190,7 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_NV21TORGB24ROW_AVX2) +#if defined(HAS_NV21TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) void NV21ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, @@ -4205,11 +4201,7 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif src_y += twidth; src_vu += twidth; dst_rgb24 += twidth * 3; @@ -4218,7 +4210,7 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TORGB565ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2) void I422ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4229,11 +4221,7 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4243,7 +4231,7 @@ void I422ToRGB565Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB1555ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB1555ROW_AVX2) void I422ToARGB1555Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4255,11 +4243,7 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB1555ROW_AVX2) ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); -#else - ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); -#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4269,7 +4253,7 @@ void I422ToARGB1555Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TOARGB4444ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTOARGB4444ROW_AVX2) void I422ToARGB4444Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4281,11 +4265,7 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTOARGB4444ROW_AVX2) ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); -#else - ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); -#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4295,7 +4275,7 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I422TORGB24ROW_AVX2) +#if defined(HAS_I422TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4307,11 +4287,7 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -4321,7 +4297,7 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_I444TORGB24ROW_AVX2) +#if defined(HAS_I444TOARGBROW_AVX2) && defined(HAS_ARGBTORGB24ROW_AVX2) void I444ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -4333,11 +4309,7 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I444ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); -#else - ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); -#endif src_y += twidth; src_u += twidth; src_v += twidth; @@ -4347,7 +4319,7 @@ void I444ToRGB24Row_AVX2(const uint8_t* src_y, } #endif -#if defined(HAS_NV12TORGB565ROW_AVX2) +#if defined(HAS_NV12TOARGBROW_AVX2) && defined(HAS_ARGBTORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, @@ -4358,11 +4330,7 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); -#if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); -#else - ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); -#endif src_y += twidth; src_uv += twidth; dst_rgb565 += twidth * 2; @@ -4622,7 +4590,7 @@ void RGBToUVMatrixRow_C(const uint8_t* src_rgb, } } -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) +#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2) void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_y, int width, @@ -4630,12 +4598,8 @@ void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_RGB24TOARGBROW_SSSE3) - RGB24ToARGBRow_Any_SSSE3(src_rgb, row, twidth); -#else - RGB24ToARGBRow_C(src_rgb, row, twidth); -#endif - ARGBToYMatrixRow_Any_AVX2(row, dst_y, twidth, c); + RGB24ToARGBRow_AVX2(src_rgb, row, twidth); + ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c); src_rgb += twidth * 3; dst_y += twidth; width -= twidth; @@ -4643,7 +4607,7 @@ void RGBToYMatrixRow_AVX2(const uint8_t* src_rgb, } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) +#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB24TOARGBROW_AVX2) void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, @@ -4653,14 +4617,10 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_RGB24TOARGBROW_SSSE3) - RGB24ToARGBRow_Any_SSSE3(src_rgb, row, twidth); - RGB24ToARGBRow_Any_SSSE3(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth); -#else - RGB24ToARGBRow_C(src_rgb, row, twidth); - RGB24ToARGBRow_C(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + RGB24ToARGBRow_AVX2(src_rgb, row, twidth); + RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, + row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb += twidth * 3; dst_u += twidth / 2; dst_v += twidth / 2; @@ -4679,9 +4639,9 @@ void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; - RGB24ToARGBRow_Any_NEON(src_rgb, row, twidth); - RGB24ToARGBRow_Any_NEON(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth); - ARGBToUVMatrixRow_Any_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + RGB24ToARGBRow_NEON(src_rgb, row, twidth); + RGB24ToARGBRow_NEON(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb += twidth * 3; dst_u += twidth / 2; dst_v += twidth / 2; @@ -4724,7 +4684,7 @@ void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565, } } -#if defined(HAS_ARGBTOYMATRIXROW_AVX2) +#if defined(HAS_ARGBTOYMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2) void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_y, int width, @@ -4732,12 +4692,8 @@ void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_RGB565TOARGBROW_SSE2) - RGB565ToARGBRow_Any_SSE2(src_rgb565, row, twidth); -#else - RGB565ToARGBRow_C(src_rgb565, row, twidth); -#endif - ARGBToYMatrixRow_Any_AVX2(row, dst_y, twidth, c); + RGB565ToARGBRow_AVX2(src_rgb565, row, twidth); + ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c); src_rgb565 += twidth * 2; dst_y += twidth; width -= twidth; @@ -4745,7 +4701,7 @@ void RGB565ToYMatrixRow_AVX2(const uint8_t* src_rgb565, } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) +#if defined(HAS_ARGBTOUVMATRIXROW_AVX2) && defined(HAS_RGB565TOARGBROW_AVX2) void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, @@ -4755,14 +4711,10 @@ void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_RGB565TOARGBROW_SSE2) - RGB565ToARGBRow_Any_SSE2(src_rgb565, row, twidth); - RGB565ToARGBRow_Any_SSE2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); -#else - RGB565ToARGBRow_C(src_rgb565, row, twidth); - RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + RGB565ToARGBRow_AVX2(src_rgb565, row, twidth); + RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, + row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb565 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; @@ -4771,7 +4723,7 @@ void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, } #endif -#if defined(HAS_RGB565TOYMATRIXROW_NEON) && defined(HAS_ARGBTOYMATRIXROW_NEON) +#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOYMATRIXROW_NEON) void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, @@ -4779,12 +4731,8 @@ void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_RGB565TOARGBROW_NEON) - RGB565ToARGBRow_Any_NEON(src_rgb565, row, twidth); -#else - RGB565ToARGBRow_C(src_rgb565, row, twidth); -#endif - ARGBToYMatrixRow_Any_NEON(row, dst_y, twidth, c); + RGB565ToARGBRow_NEON(src_rgb565, row, twidth); + ARGBToYMatrixRow_NEON(row, dst_y, twidth, c); src_rgb565 += twidth * 2; dst_y += twidth; width -= twidth; @@ -4792,7 +4740,7 @@ void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, } #endif -#if defined(HAS_RGB565TOUVMATRIXROW_NEON) +#if defined(HAS_RGB565TOARGBROW_NEON) && defined(HAS_ARGBTOUVMATRIXROW_NEON) void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, @@ -4802,14 +4750,9 @@ void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_RGB565TOARGBROW_NEON) - RGB565ToARGBRow_Any_NEON(src_rgb565, row, twidth); - RGB565ToARGBRow_Any_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); -#else - RGB565ToARGBRow_C(src_rgb565, row, twidth); - RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + RGB565ToARGBRow_NEON(src_rgb565, row, twidth); + RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb565 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; @@ -4818,7 +4761,6 @@ void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, } #endif - void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width, @@ -4888,6 +4830,7 @@ void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, } #if defined(HAS_ARGBTOYMATRIXROW_AVX2) +#if defined(HAS_ARGB1555TOARGBROW_AVX2) void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_y, int width, @@ -4895,20 +4838,16 
@@ void ARGB1555ToYMatrixRow_AVX2(const uint8_t* src_argb1555, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - ARGB1555ToARGBRow_Any_AVX2(src_argb1555, row, twidth); -#elif defined(HAS_ARGB1555TOARGBROW_SSE2) - ARGB1555ToARGBRow_Any_SSE2(src_argb1555, row, twidth); -#else - ARGB1555ToARGBRow_C(src_argb1555, row, twidth); -#endif - ARGBToYMatrixRow_Any_AVX2(row, dst_y, twidth, c); + ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth); + ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c); src_argb1555 += twidth * 2; dst_y += twidth; width -= twidth; } } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_y, int width, @@ -4916,22 +4855,18 @@ void ARGB4444ToYMatrixRow_AVX2(const uint8_t* src_argb4444, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - ARGB4444ToARGBRow_Any_AVX2(src_argb4444, row, twidth); -#elif defined(HAS_ARGB4444TOARGBROW_SSE2) - ARGB4444ToARGBRow_Any_SSE2(src_argb4444, row, twidth); -#else - ARGB4444ToARGBRow_C(src_argb4444, row, twidth); -#endif - ARGBToYMatrixRow_Any_AVX2(row, dst_y, twidth, c); + ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth); + ARGBToYMatrixRow_AVX2(row, dst_y, twidth, c); src_argb4444 += twidth * 2; dst_y += twidth; width -= twidth; } } #endif +#endif #if defined(HAS_ARGBTOUVMATRIXROW_AVX2) +#if defined(HAS_ARGB1555TOARGBROW_AVX2) void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, @@ -4941,24 +4876,19 @@ void ARGB1555ToUVMatrixRow_AVX2(const uint8_t* src_argb1555, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_ARGB1555TOARGBROW_AVX2) - ARGB1555ToARGBRow_Any_AVX2(src_argb1555, row, twidth); - ARGB1555ToARGBRow_Any_AVX2(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); -#elif defined(HAS_ARGB1555TOARGBROW_SSE2) - ARGB1555ToARGBRow_Any_SSE2(src_argb1555, row, twidth); - ARGB1555ToARGBRow_Any_SSE2(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); -#else - ARGB1555ToARGBRow_C(src_argb1555, row, twidth); - ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + ARGB1555ToARGBRow_AVX2(src_argb1555, row, twidth); + ARGB1555ToARGBRow_AVX2(src_argb1555 + src_stride_argb1555, + row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb1555 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; width -= twidth; } } +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, @@ -4968,17 +4898,10 @@ void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_ARGB4444TOARGBROW_AVX2) - ARGB4444ToARGBRow_Any_AVX2(src_argb4444, row, twidth); - ARGB4444ToARGBRow_Any_AVX2(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); -#elif defined(HAS_ARGB4444TOARGBROW_SSE2) - ARGB4444ToARGBRow_Any_SSE2(src_argb4444, row, twidth); - ARGB4444ToARGBRow_Any_SSE2(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); -#else - ARGB4444ToARGBRow_C(src_argb4444, row, twidth); - ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + ARGB4444ToARGBRow_AVX2(src_argb4444, row, twidth); + ARGB4444ToARGBRow_AVX2(src_argb4444 + src_stride_argb4444, + row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb4444 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; @@ -4986,8 +4909,9 @@ void ARGB4444ToUVMatrixRow_AVX2(const uint8_t* src_argb4444, } } #endif +#endif -#if defined(HAS_ARGBTOYMATRIXROW_NEON) +#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON) void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, @@ -4995,18 +4919,16 @@ void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_ARGB1555TOARGBROW_NEON) - ARGB1555ToARGBRow_Any_NEON(src_argb1555, row, twidth); -#else - ARGB1555ToARGBRow_C(src_argb1555, row, twidth); -#endif - ARGBToYMatrixRow_Any_NEON(row, dst_y, twidth, c); + ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth); + ARGBToYMatrixRow_NEON(row, dst_y, twidth, c); src_argb1555 += twidth * 2; dst_y += twidth; width -= twidth; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON) void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, @@ -5014,12 +4936,8 @@ void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; -#if defined(HAS_ARGB4444TOARGBROW_NEON) - ARGB4444ToARGBRow_Any_NEON(src_argb4444, row, twidth); -#else - ARGB4444ToARGBRow_C(src_argb4444, row, twidth); -#endif - ARGBToYMatrixRow_Any_NEON(row, dst_y, twidth, c); + ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth); + ARGBToYMatrixRow_NEON(row, dst_y, twidth, c); src_argb4444 += twidth * 2; dst_y += twidth; width -= twidth; @@ -5027,7 +4945,7 @@ void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, } #endif -#if defined(HAS_ARGBTOUVMATRIXROW_NEON) +#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB1555TOARGBROW_NEON) void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, @@ -5037,21 +4955,18 @@ void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_ARGB1555TOARGBROW_NEON) - ARGB1555ToARGBRow_Any_NEON(src_argb1555, row, twidth); - ARGB1555ToARGBRow_Any_NEON(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); -#else - ARGB1555ToARGBRow_C(src_argb1555, row, twidth); - ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth); + ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb1555 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; width -= twidth; } } +#endif +#if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_ARGB4444TOARGBROW_NEON) void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, @@ -5061,14 +4976,9 @@ void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; -#if defined(HAS_ARGB4444TOARGBROW_NEON) - ARGB4444ToARGBRow_Any_NEON(src_argb4444, row, twidth); - ARGB4444ToARGBRow_Any_NEON(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); -#else - ARGB4444ToARGBRow_C(src_argb4444, row, twidth); - ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); -#endif - ARGBToUVMatrixRow_Any_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth); + ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); + ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb4444 += twidth * 2; dst_u += twidth / 2; dst_v += twidth / 2; diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 7d7f1df39..e37e58b01 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -33,7 +33,6 @@ extern "C" { static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u}; - #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) @@ -52,8 +51,10 @@ static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; +static const uvec8 kShuffleMaskRGB24ToARGB[2] = { + {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u}, + {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, + 128u}}; // Shuffle table for converting RAW to ARGB. 
static const uvec8 kShuffleMaskRAWToARGB = { @@ -223,10 +224,57 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "m"(kShuffleMaskRGB24ToARGB[0]) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } +#ifdef HAS_RGB24TOARGBROW_AVX2 +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + // Reference to prevent discarding of kShuffleMaskRGB24ToARGB[1] which is + // accessed via offset in assembly. + const uvec8* dummy = &kShuffleMaskRGB24ToARGB[1]; + (void)dummy; + asm volatile( + "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0xff000000 + "vpslld $0x18,%%ymm6,%%ymm6 \n" + "vbroadcasti128 %3,%%ymm4 \n" + "vbroadcasti128 16+%3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // first 12 + "vinserti128 $1,12(%0),%%ymm0,%%ymm0 \n" // second 12 + "vmovdqu 24(%0),%%xmm1 \n" // third 12 + "vinserti128 $1,36(%0),%%ymm1,%%ymm1 \n" // fourth 12 + "vmovdqu 48(%0),%%xmm2 \n" // fifth 12 + "vinserti128 $1,60(%0),%%ymm2,%%ymm2 \n" // sixth 12 + "vmovdqu 68(%0),%%xmm3 \n" // seventh 12 + "vinserti128 $1,80(%0),%%ymm3,%%ymm3 \n" // eighth 12 + "lea 96(%0),%0 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm1,%%ymm1 \n" + "vpor %%ymm6,%%ymm2,%%ymm2 \n" + "vpor %%ymm6,%%ymm3,%%ymm3 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "vmovdqu %%ymm3,0x60(%1) \n" + "lea 0x80(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB[0]) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_RGB24TOARGBROW_AVX2 + void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, 
int width) { asm volatile( "pcmpeqb %%xmm6,%%xmm6 \n" // 0xff000000 @@ -359,11 +407,11 @@ void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) } void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB, width); + RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, + (const uint32_t*)&kShuffleMaskRGB24ToARGB[0], width); } #endif - // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( @@ -435,46 +483,47 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { +#ifdef HAS_RGB565TOARGBROW_AVX2 +void RGB565ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" + "vmovd %%eax,%%xmm5 \n" + "vpbroadcastd %%xmm5,%%ymm5 \n" "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $10,%%xmm4 \n" - "psrlw $5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" + "vmovd %%eax,%%xmm6 \n" + "vpbroadcastd %%xmm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsllw $0xb,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsllw $10,%%ymm4,%%ymm4 \n" + "vpsrlw $5,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpsllw $0x8,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - 
"pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" + "vmovdqu (%0),%%ymm0 \n" + "vpand %%ymm3,%%ymm0,%%ymm1 \n" + "vpsllw $0xb,%%ymm0,%%ymm2 \n" + "vpmulhuw %%ymm5,%%ymm1,%%ymm1 \n" + "vpmulhuw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpand %%ymm4,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" + "vpor %%ymm7,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm1,%%ymm2 \n" + "vpunpckhbw %%ymm0,%%ymm1,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm2,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0,(%1,%0,2) \n" + "vmovdqu %%ymm1,0x20(%1,%0,2) \n" + "lea 0x20(%0),%0 \n" + "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -482,50 +531,50 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } +#endif -void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +void ARGB1555ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" + "vmovd %%eax,%%xmm5 \n" + "vpbroadcastd %%xmm5,%%ymm5 \n" "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" + "vmovd %%eax,%%xmm6 \n" + "vpbroadcastd %%xmm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsllw $0xb,%%ymm3,%%ymm3 \n" + "vpsrlw $0x6,%%ymm3,%%ymm4 \n" + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" + "vpsllw $0x8,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" "sub %0,%1 
\n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,0x00(%1,%0,2) \n" - "movdqu %%xmm2,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" + "vmovdqu (%0),%%ymm0 \n" + "vpsllw $0x1,%%ymm0,%%ymm1 \n" + "vpsllw $0xb,%%ymm0,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpmulhuw %%ymm5,%%ymm2,%%ymm2 \n" + "vpmulhuw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm2 \n" + "vpand %%ymm4,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" + "vpand %%ymm7,%%ymm2,%%ymm2 \n" + "vpor %%ymm2,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm1,%%ymm2 \n" + "vpunpckhbw %%ymm0,%%ymm1,%%ymm1 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm2,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm1,%%ymm2,%%ymm1 \n" + "vmovdqu %%ymm0,(%1,%0,2) \n" + "vmovdqu %%ymm1,0x20(%1,%0,2) \n" + "lea 0x20(%0),%0 \n" + "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -533,43 +582,44 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } +#endif -void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +void ARGB4444ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" + "mov 
$0x0f0f0f0f,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vpbroadcastd %%xmm4,%%ymm4 \n" + "vpslld $0x4,%%ymm4,%%ymm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,0x00(%1,%0,2) \n" - "movdqu %%xmm1,0x10(%1,%0,2) \n" - "lea 0x10(%0),%0 \n" - "sub $0x8,%2 \n" + "vmovdqu (%0),%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm2 \n" + "vpand %%ymm4,%%ymm0,%%ymm0 \n" + "vpsllw $0x4,%%ymm0,%%ymm1 \n" + "vpsrlw $0x4,%%ymm2,%%ymm3 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm2 \n" + "vperm2i128 $0x31,%%ymm1,%%ymm0,%%ymm1 \n" + "vmovdqu %%ymm2,(%1,%0,2) \n" + "vmovdqu %%ymm1,0x20(%1,%0,2) \n" + "lea 0x20(%0),%0 \n" + "sub $0x10,%2 \n" "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } +#endif void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile("movdqa %3,%%xmm6 \n" @@ -790,90 +840,6 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif -void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - 
"pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, - uint8_t* dst, - uint32_t dither4, - int width) { - asm volatile( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); -} - #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, @@ -920,75 +886,6 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb 
%%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); -} - -void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xc,%%xmm4 \n" - "movdqa %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm3 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "psrlq $0x4,%%xmm0 \n" - "psrlq $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); -} #endif // HAS_RGB24TOARGBROW_SSSE3 /* @@ -2058,7 +1955,6 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBTOUV444ROW_SSSE3 - #ifdef HAS_ARGBTOYROW_AVX2 void RGBAToYRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { ARGBToYMatrixRow_AVX2(src_rgba, dst_y, width, &kRgbaI601Constants); @@ -2071,7 +1967,6 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) { } #endif - #ifdef HAS_ARGBTOYROW_AVX512BW void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) { ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, 
&kArgbI601Constants); @@ -4814,8 +4709,6 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { } #endif // HAS_MIRRORUVROW_AVX2 - - #ifdef HAS_RGB24MIRRORROW_SSSE3 // Shuffle first 5 pixels to last 5 mirrored. first byte zero @@ -9006,6 +8899,36 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 +#ifdef HAS_ARGBSHUFFLEROW_AVX512BW +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vbroadcasti32x4 (%3),%%zmm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu8 (%0),%%zmm0 \n" + "vmovdqu8 0x40(%0),%%zmm1 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%zmm5,%%zmm0,%%zmm0 \n" + "vpshufb %%zmm5,%%zmm1,%%zmm1 \n" + "vmovdqu8 %%zmm0,(%1) \n" + "vmovdqu8 %%zmm1,0x40(%1) \n" + "lea 0x80(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_ARGBSHUFFLEROW_AVX512BW + #ifdef HAS_I422TOYUY2ROW_SSE2 void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_u, diff --git a/source/row_win.cc b/source/row_win.cc index 847d3a04d..28b1dbc7f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -704,6 +704,188 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { } #endif // HAS_J400TOARGBROW_AVX2 +#ifdef HAS_RGB24TOARGBROW_AVX2 +alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = { + {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u}, + {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, 128u} +}; +#endif + +#ifdef HAS_RGB565TOARGBROW_AVX2 +LIBYUV_TARGET_AVX2 +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { + __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108); + __m256i ymm_scale_g = 
_mm256_set1_epi32(0x20802080); + __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800); + __m256i ymm_mask_g = _mm256_set1_epi16(0x07e0); + __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00); + + while (width > 0) { + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_rgb565); + __m256i ymm1 = ymm0; + __m256i ymm2 = ymm0; + + ymm1 = _mm256_and_si256(ymm1, ymm_mask_b); + ymm2 = _mm256_slli_epi16(ymm2, 11); + ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb); + ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb); + ymm1 = _mm256_slli_epi16(ymm1, 8); + ymm1 = _mm256_or_si256(ymm1, ymm2); // RB + + ymm0 = _mm256_and_si256(ymm0, ymm_mask_g); + ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g); + ymm0 = _mm256_or_si256(ymm0, ymm_mask_a); // GA + + ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0); + ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0); + + ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20); + ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31); + + _mm256_storeu_si256((__m256i*)dst_argb, ymm0); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); + + src_rgb565 += 32; + dst_argb += 64; + width -= 16; + } + _mm256_zeroupper(); +} +#endif + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +LIBYUV_TARGET_AVX2 +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { + __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108); + __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200); + __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800); + __m256i ymm_mask_g = _mm256_set1_epi16(0x03e0); + __m256i ymm_mask_a = _mm256_set1_epi16((short)0xff00); + + while (width > 0) { + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb1555); + __m256i ymm1 = ymm0; + __m256i ymm2 = ymm0; + + ymm1 = _mm256_slli_epi16(ymm1, 1); + ymm2 = _mm256_slli_epi16(ymm2, 11); + ymm1 = _mm256_and_si256(ymm1, ymm_mask_b); + ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb); + ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb); + ymm1 = _mm256_slli_epi16(ymm1, 8); + ymm1 = _mm256_or_si256(ymm1, ymm2); // RB + + 
ymm2 = ymm0; + ymm0 = _mm256_and_si256(ymm0, ymm_mask_g); + ymm2 = _mm256_srai_epi16(ymm2, 8); + ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g); + ymm2 = _mm256_and_si256(ymm2, ymm_mask_a); + ymm0 = _mm256_or_si256(ymm0, ymm2); // GA + + ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0); + ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0); + + ymm0 = _mm256_permute2x128_si256(ymm2, ymm1, 0x20); + ymm1 = _mm256_permute2x128_si256(ymm2, ymm1, 0x31); + + _mm256_storeu_si256((__m256i*)dst_argb, ymm0); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); + + src_argb1555 += 32; + dst_argb += 64; + width -= 16; + } + _mm256_zeroupper(); +} +#endif + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +LIBYUV_TARGET_AVX2 +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { + __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f); + __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4); + + while (width > 0) { + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb4444); + __m256i ymm2 = ymm0; + + ymm0 = _mm256_and_si256(ymm0, ymm_mask); + ymm2 = _mm256_and_si256(ymm2, ymm_mask2); + + __m256i ymm1 = ymm0; + __m256i ymm3 = ymm2; + + ymm1 = _mm256_slli_epi16(ymm1, 4); + ymm3 = _mm256_srli_epi16(ymm3, 4); + + ymm0 = _mm256_or_si256(ymm0, ymm1); + ymm2 = _mm256_or_si256(ymm2, ymm3); + + ymm1 = ymm0; + ymm0 = _mm256_unpacklo_epi8(ymm0, ymm2); + ymm1 = _mm256_unpackhi_epi8(ymm1, ymm2); + + ymm2 = _mm256_permute2x128_si256(ymm0, ymm1, 0x20); + ymm1 = _mm256_permute2x128_si256(ymm0, ymm1, 0x31); + + _mm256_storeu_si256((__m256i*)dst_argb, ymm2); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); + + src_argb4444 += 32; + dst_argb += 64; + width -= 16; + } + _mm256_zeroupper(); +} +#endif + +#ifdef HAS_RGB24TOARGBROW_AVX2 +LIBYUV_TARGET_AVX2 +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { + __m256i ymm_alpha = _mm256_set1_epi32(0xff000000); + __m256i ymm_shuf = _mm256_broadcastsi128_si256(_mm_load_si128((const 
__m128i*)kShuffleMaskRGB24ToARGB[0])); + __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1])); + + while (width > 0) { + __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24); + __m256i ymm0 = _mm256_castsi128_si256(xmm0); + ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1); + + __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24)); + __m256i ymm1 = _mm256_castsi128_si256(xmm1); + ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1); + + __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48)); + __m256i ymm2 = _mm256_castsi128_si256(xmm2); + ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1); + + __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68)); + __m256i ymm3 = _mm256_castsi128_si256(xmm3); + ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1); + + ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); + ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); + ymm2 = _mm256_shuffle_epi8(ymm2, ymm_shuf); + ymm3 = _mm256_shuffle_epi8(ymm3, ymm_shuf2); + + ymm0 = _mm256_or_si256(ymm0, ymm_alpha); + ymm1 = _mm256_or_si256(ymm1, ymm_alpha); + ymm2 = _mm256_or_si256(ymm2, ymm_alpha); + ymm3 = _mm256_or_si256(ymm3, ymm_alpha); + + _mm256_storeu_si256((__m256i*)dst_argb, ymm0); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), ymm1); + _mm256_storeu_si256((__m256i*)(dst_argb + 64), ymm2); + _mm256_storeu_si256((__m256i*)(dst_argb + 96), ymm3); + + src_rgb24 += 96; + dst_argb += 128; + width -= 32; + } + _mm256_zeroupper(); +} +#endif + #endif #ifdef __cplusplus