diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index be8cf6177..31e66b97b 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -200,6 +200,7 @@ extern "C" {
 #define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I444TOARGBROW_AVX2
 #define HAS_I411TOARGBROW_AVX2
+#define HAS_I400TOARGBROW_AVX2
 // TODO(fbarchard): Port to Neon
 #define HAS_ARGBTORGB565DITHERROW_SSE2
 #define HAS_ARGBTORGB565DITHERROW_AVX2
@@ -935,9 +936,11 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
 
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix);
 void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
 
 void I444ToARGBRow_C(const uint8* src_y,
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 5ce2fd590..ce1d28bb6 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -349,6 +349,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I400ToARGBRow = I400ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I400TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I400ToARGBRow = I400ToARGBRow_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index 216c995bc..fb04a9dee 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -192,6 +192,9 @@ RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C,
 #if defined(HAS_I400TOARGBROW_SSE2)
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7)
 #endif
+#if defined(HAS_I400TOARGBROW_AVX2)
+RGBANY(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, I400ToARGBRow_C, 1, 4, 15)
+#endif
 #if defined(HAS_YTOARGBROW_SSE2)
 RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7)
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index a98728d34..f408cf125 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -284,6 +284,38 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
+#ifdef HAS_I400TOARGBROW_AVX2
+// Duplicates gray value 3 times and fills in alpha opaque. 16 pixels per loop.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov         eax, [esp + 4]        // src_y
+    mov         edx, [esp + 8]        // dst_argb
+    mov         ecx, [esp + 12]       // pix
+    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    vpslld      ymm5, ymm5, 24        // all-ones << 24 in every dword
+
+  convertloop:
+    vmovdqu     xmm0, [eax]           // load 16 Y bytes into low half of ymm0
+    lea         eax, [eax + 16]       // src_y += 16
+    vpermq      ymm0, ymm0, 0xd8      // qwords 0,2,1,3: 8 Y bytes per lane
+    vpunpcklbw  ymm0, ymm0, ymm0      // duplicate bytes: Y -> YY
+    vpermq      ymm0, ymm0, 0xd8      // qwords 0,2,1,3 again for word unpack
+    vpunpckhwd  ymm1, ymm0, ymm0      // YY -> YYYY, pixels 8..15
+    vpunpcklwd  ymm0, ymm0, ymm0      // YY -> YYYY, pixels 0..7
+    vpor        ymm0, ymm0, ymm5      // force top byte (alpha) to 0xff
+    vpor        ymm1, ymm1, ymm5
+    vmovdqu     [edx], ymm0           // store ARGB pixels 0..7
+    vmovdqu     [edx + 32], ymm1      // store ARGB pixels 8..15
+    lea         edx, [edx + 64]       // dst_argb += 16 pixels * 4 bytes
+    sub         ecx, 16               // 16 pixels consumed
+    jg          convertloop           // loop while pix > 0
+    vzeroupper                        // clear upper ymm halves before return
+    ret
+  }
+}
+#endif  // HAS_I400TOARGBROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {