diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 6c202834a..02912bde9 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -75,6 +75,7 @@ extern "C" {
 #define HAS_MERGEUVROW_SSE2
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORUVROW_SSSE3
+#define HAS_MIRRORROW_AVX2
 #define HAS_NV12TOARGBROW_SSSE3
 #define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
@@ -557,6 +558,7 @@ void ARGBToUV422Row_C(const uint8* src_argb,
                       uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUV411Row_C(const uint8* src_argb,
                       uint8* dst_u, uint8* dst_v, int width);
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index db8ad43b5..ef6f4edb8 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -195,6 +195,13 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     MirrorRow = MirrorRow_SSSE3;
   }
 #endif
+#if defined(HAS_MIRRORROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    clear = true;
+    MirrorRow = MirrorRow_AVX2;
+  }
+#endif
 
   // Mirror plane
   for (int y = 0; y < height; ++y) {
@@ -202,6 +209,11 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
+#if defined(HAS_MIRRORROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
 }
 
 // Convert YUY2 to I422.
diff --git a/source/rotate.cc b/source/rotate.cc
index b8235f29a..b04493bfe 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -881,6 +881,13 @@ void RotatePlane180(const uint8* src, int src_stride,
     MirrorRow = MirrorRow_SSSE3;
   }
 #endif
+#if defined(HAS_MIRRORROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    clear = true;
+    MirrorRow = MirrorRow_AVX2;
+  }
+#endif
 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
       IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
@@ -906,6 +913,12 @@ void RotatePlane180(const uint8* src, int src_stride,
     CopyRow = CopyRow_SSE2;
   }
 #endif
+#if defined(HAS_COPYROW_AVX2)
+  // TODO(fbarchard): Detect Fast String support.
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
@@ -929,6 +942,11 @@ void RotatePlane180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
+#if defined(HAS_MIRRORROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
 }
 
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index 4a3eb1c4e..20406f5d5 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -122,6 +122,12 @@ void ARGBRotate180(const uint8* src, int src_stride,
     CopyRow = CopyRow_SSE2;
   }
 #endif
+#if defined(HAS_COPYROW_AVX2)
+  // TODO(fbarchard): Detect Fast String support.
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
diff --git a/source/row_win.cc b/source/row_win.cc
index ea773c721..272ac7320 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2905,7 +2905,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
 #endif  // HAS_YTOARGBROW_SSE2
 
 #ifdef HAS_MIRRORROW_SSSE3
-
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
@@ -2933,6 +2932,36 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
+#ifdef HAS_MIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec8 kShuffleMirror_AVX2 = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vmovdqa   ymm5, kShuffleMirror_AVX2
+    lea       eax, [eax - 32]
+
+    align     16
+ convertloop:
+    vmovdqu   ymm0, [eax + ecx]
+    vpshufb   ymm0, ymm0, ymm5
+    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
+    sub       ecx, 32
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_AVX2
+
 #ifdef HAS_MIRRORROW_SSE2
 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 // version can not.
@@ -3000,7 +3029,6 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 #endif  // HAS_MIRRORROW_UV_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSSE3
-
 // Shuffle table for reversing the bytes.
 static const uvec8 kARGBShuffleMirror = {
   12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
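
A note for readers following the new MirrorRow_AVX2 assembly: vpshufb reverses bytes only within each 128-bit lane of a ymm register, so the vpermq with immediate 0x4e (qword order 2,3,0,1) is what swaps the two lanes and completes the full 32-byte reversal. The rough C intrinsics sketch below shows the same computation; it is not part of the patch, the _sketch name is illustrative, and like the assembly it assumes width is a positive multiple of 32.

#include <immintrin.h>
#include <stdint.h>

void MirrorRow_AVX2_sketch(const uint8_t* src, uint8_t* dst, int width) {
  // Per-lane byte-reversal indices, equivalent to kShuffleMirror_AVX2.
  const __m256i shuf = _mm256_setr_epi8(
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
      15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  // Walk src backward 32 bytes at a time while writing dst forward.
  for (int x = width - 32; x >= 0; x -= 32) {
    __m256i v = _mm256_loadu_si256((const __m256i*)(src + x));
    v = _mm256_shuffle_epi8(v, shuf);       // reverse within each 128-bit lane
    v = _mm256_permute4x64_epi64(v, 0x4e);  // then swap the two lanes
    _mm256_storeu_si256((__m256i*)dst, v);
    dst += 32;
  }
  _mm256_zeroupper();  // the asm version leaves this step to its callers
}

One nicety in the assembly itself: biasing eax by -32 and addressing the load as [eax + ecx] lets the loop branch directly on the flags from sub ecx, 32, with no separate compare.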
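
Two caller-side details follow from this. First, MirrorRow_AVX2 has no scalar tail loop, which is why every call site guards it with IS_ALIGNED(width, 32); other widths keep falling through to the SSSE3, SSE2, or C rows. Second, the function returns with the upper ymm state still dirty, and mixing dirty-upper AVX code with legacy SSE instructions triggers a state-transition penalty on current Intel cores, so each caller records the selection in the clear flag and issues a single __asm vzeroupper after its row loop, only when the AVX2 path was actually chosen.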