diff --git a/README.chromium b/README.chromium index ac6bf7628..8fc27c722 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 568 +Version: 569 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4802bf604..64589a72a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -123,12 +123,20 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBCOLORTABLEROW_X86 +#define HAS_COPYROW_AVX2 // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion. #define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_SPLITUVROW_AVX2 +#define HAS_MERGEUVROW_AVX2 +#define HAS_YUY2TOUV422ROW_AVX2 +#define HAS_YUY2TOUVROW_AVX2 +#define HAS_YUY2TOYROW_AVX2 +#define HAS_UYVYTOUV422ROW_AVX2 +#define HAS_UYVYTOUVROW_AVX2 +#define HAS_UYVYTOYROW_AVX2 #endif #endif @@ -375,7 +383,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int width); void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); -void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); @@ -449,8 +456,6 @@ void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix); void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, @@ -570,8 +575,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); -void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix); void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, @@ -593,8 +596,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); -void MergeUVRow_Unaligned_AVX2(const uint8* src_u, const uint8* src_v, - uint8* dst_uv, int width); void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, @@ -603,6 +604,7 @@ void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width); void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_AVX2(const uint8* src, uint8* dst, int count); void CopyRow_X86(const uint8* src, uint8* dst, int count); void CopyRow_NEON(const uint8* src, uint8* dst, int count); void CopyRow_MIPS(const uint8* src, uint8* dst, int count); @@ -1154,6 +1156,11 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y, uint8* dst_argb, int width); +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* 
dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); @@ -1175,6 +1182,11 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); void YUY2ToUV422Row_C(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix); +void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix); +void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix); void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); @@ -1185,7 +1197,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix); void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix); - +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); @@ -1197,6 +1213,11 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix); void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); @@ -1208,6 +1229,11 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); void UYVYToUV422Row_C(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); +void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix); +void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix); +void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix); void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix); void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 49030f107..89032ecc9 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 568 +#define LIBYUV_VERSION 569 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 770e37dc5..874c2c66c 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -290,25 +290,32 @@ int I400ToI420(const uint8* src_y, int src_stride_y, } static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, 
int dst_stride_frame, + uint8* dst, int dst_stride, int width, int height) { void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_X86) + if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; + } +#endif +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif +#if defined(HAS_COPYROW_AVX2) + // TODO(fbarchard): Detect Fast String support. + if (TestCpuFlag(kCpuHasAVX2)) { + CopyRow = CopyRow_AVX2; + } +#endif #if defined(HAS_COPYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_NEON; } -#elif defined(HAS_COPYROW_X86) - if (IS_ALIGNED(width, 4)) { - CopyRow = CopyRow_X86; -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) && - IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && - IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) { - CopyRow = CopyRow_SSE2; - } -#endif - } #endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { @@ -319,9 +326,9 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // Copy plane for (int y = 0; y < height - 1; y += 2) { CopyRow(src, dst, width); - CopyRow(src + src_stride_0, dst + dst_stride_frame, width); + CopyRow(src + src_stride_0, dst + dst_stride, width); src += src_stride_0 + src_stride_1; - dst += dst_stride_frame * 2; + dst += dst_stride * 2; } if (height & 1) { CopyRow(src, dst, width); @@ -381,12 +388,7 @@ static int X420ToI420(const uint8* src_y, if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { SplitUVRow = SplitUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { - SplitUVRow = SplitUVRow_Unaligned_AVX2; - if (IS_ALIGNED(src_uv, 32) && IS_ALIGNED(src_stride_uv, 32) && - IS_ALIGNED(dst_u, 32) && IS_ALIGNED(dst_stride_u, 32) && - IS_ALIGNED(dst_v, 32) && IS_ALIGNED(dst_stride_v, 32)) { - SplitUVRow = SplitUVRow_AVX2; - } + SplitUVRow = SplitUVRow_AVX2; } } #endif @@ -413,8 +415,12 @@ static int X420ToI420(const uint8* src_y, #endif if (dst_y) { - CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, - width, height); + if (src_stride_y0 == src_stride_y1) { + CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height); + } else { + CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y, + width, height); + } } int halfheight = (height + 1) >> 1; @@ -519,6 +525,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + CopyRow = CopyRow_AVX2; + } +#endif #if defined(HAS_COPYROW_MIPS) if (TestCpuFlag(kCpuHasMIPS)) { CopyRow = CopyRow_MIPS; @@ -544,7 +555,20 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, } } } -#elif defined(HAS_YUY2TOYROW_NEON) +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + bool clear = false; + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + clear = true; + YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToUV422Row = YUY2ToUV422Row_AVX2; + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { YUY2ToYRow = YUY2ToYRow_Any_NEON; if (width >= 16) { @@ -573,6 +597,11 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, 
     CopyRow(src_y, dst_y, width);
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
   }
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -610,7 +639,20 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
       }
     }
   }
-#elif defined(HAS_YUY2TOYROW_NEON)
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUVRow = YUY2ToUVRow_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
     if (width >= 16) {
@@ -636,6 +678,12 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
   }
+
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -673,7 +721,20 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
       }
     }
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUVRow = UYVYToUVRow_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
     if (width >= 16) {
@@ -699,6 +760,12 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
   }
+
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -747,14 +814,8 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2;
-      ARGBToYRow = ARGBToYRow_Unaligned_AVX2;
-      if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) {
-        ARGBToUVRow = ARGBToUVRow_AVX2;
-        if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) {
-          ARGBToYRow = ARGBToYRow_AVX2;
-        }
-      }
+      ARGBToUVRow = ARGBToUVRow_AVX2;
+      ARGBToYRow = ARGBToYRow_AVX2;
     }
   }
 #endif
diff --git a/source/convert_from.cc b/source/convert_from.cc
index c29903fee..5856459e9 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -451,15 +451,12 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
+  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+    clear = true;
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
-      MergeUVRow_ = MergeUVRow_Unaligned_AVX2;
-      if (IS_ALIGNED(src_u, 32) && IS_ALIGNED(src_stride_u, 32) &&
-          IS_ALIGNED(src_v, 32) && IS_ALIGNED(src_stride_v, 32) &&
-          IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) {
-        MergeUVRow_ = MergeUVRow_AVX2;
-      }
+      MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
 #endif
@@ -481,6 +478,12 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
     src_v += src_stride_v;
     dst_uv += dst_stride_uv;
   }
+#if defined(HAS_MERGEUVROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
+
   return 0;
 }
 
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index bcaab899a..2563728d4 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -183,7 +183,18 @@ int
ARGBToI411(const uint8* src_argb, int src_stride_argb, } } } -#elif defined(HAS_ARGBTOYROW_NEON) +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + bool clear = false; + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + clear = true; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { @@ -206,6 +217,11 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, dst_u += dst_stride_u; dst_v += dst_stride_v; } +#if defined(HAS_ARGBTOYROW_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif return 0; } @@ -277,10 +293,7 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_Unaligned_AVX2; - if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } + MergeUVRow_ = MergeUVRow_AVX2; } } #endif @@ -383,10 +396,7 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { - MergeUVRow_ = MergeUVRow_Unaligned_AVX2; - if (IS_ALIGNED(dst_uv, 32) && IS_ALIGNED(dst_stride_uv, 32)) { - MergeUVRow_ = MergeUVRow_AVX2; - } + MergeUVRow_ = MergeUVRow_AVX2; } } #endif @@ -624,7 +634,18 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } } } -#elif defined(HAS_ARGBTOYROW_NEON) +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + bool clear = false; + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + clear = true; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { @@ -638,6 +659,11 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_argb += src_stride_argb; dst_y += dst_stride_y; } +#if defined(HAS_ARGBTOYROW_AVX2) + if (clear) { + __asm vzeroupper; + } +#endif return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 31f52b6bb..642bb0929 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -34,11 +34,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; -#if defined(HAS_COPYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { - CopyRow = CopyRow_NEON; - } -#endif #if defined(HAS_COPYROW_X86) if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { CopyRow = CopyRow_X86; @@ -51,6 +46,17 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_AVX2) + // TODO(fbarchard): Detect Fast String support. 
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    CopyRow = CopyRow_AVX2;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
 #if defined(HAS_COPYROW_MIPS)
   if (TestCpuFlag(kCpuHasMIPS)) {
     CopyRow = CopyRow_MIPS;
   }
 #endif
@@ -119,11 +125,11 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MIRRORROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
-        IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
-        IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
-      MirrorRow = MirrorRow_SSSE3;
-    }
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    MirrorRow = MirrorRow_SSSE3;
+  }
 #endif
 
   // Mirror plane
@@ -148,17 +154,15 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     src_stride_yuy2 = -src_stride_yuy2;
   }
   void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                        uint8* dst_u, uint8* dst_v, int pix);
+                         uint8* dst_u, uint8* dst_v, int pix);
   void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix);
   YUY2ToYRow = YUY2ToYRow_C;
   YUY2ToUV422Row = YUY2ToUV422Row_C;
 
 #if defined(HAS_YUY2TOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
-      YUY2ToYRow = YUY2ToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
       YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
@@ -170,13 +174,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
       }
     }
   }
-#elif defined(HAS_YUY2TOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      YUY2ToYRow = YUY2ToYRow_Any_NEON;
-      if (width > 16) {
-        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-      }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     }
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
@@ -193,6 +208,12 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
+
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -210,17 +231,15 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     src_stride_uyvy = -src_stride_uyvy;
   }
   void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                        uint8* dst_u, uint8* dst_v, int pix);
+                         uint8* dst_u, uint8* dst_v, int pix);
   void (*UYVYToYRow)(const uint8* src_uyvy, uint8* dst_y, int pix);
   UYVYToYRow = UYVYToYRow_C;
   UYVYToUV422Row = UYVYToUV422Row_C;
 
 #if defined(HAS_UYVYTOYROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    if (width > 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
-      UYVYToYRow = UYVYToYRow_Any_SSE2;
-    }
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
     if (IS_ALIGNED(width, 16)) {
       UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
       UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
@@ -232,13 +251,24 @@ int
UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
       }
     }
   }
-#elif defined(HAS_UYVYTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    if (width > 8) {
-      UYVYToYRow = UYVYToYRow_Any_NEON;
-      if (width > 16) {
-        UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-      }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     }
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
@@ -255,6 +285,12 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
+
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
diff --git a/source/row_any.cc b/source/row_any.cc
index cbc6cc012..ffbc8f399 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -196,7 +196,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
     }
 
 #ifdef HAS_ARGBTOYROW_AVX2
-YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32)
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
 #endif
 #ifdef HAS_ARGBTOYROW_SSSE3
 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -266,7 +268,9 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
     }
 
 #ifdef HAS_ARGBTOYROW_AVX2
-UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
+UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
 #endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
@@ -306,6 +310,12 @@ UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
          ARGBToUV444Row_C, 4, 15, 0)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
+         YUY2ToUV422Row_C, 2, 31, 1)
+UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
+         UYVYToUV422Row_C, 2, 31, 1)
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
 UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
          ARGBToUV422Row_C, 4, 15, 1)
@@ -343,7 +353,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_AVX2
-SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
 #endif
 #ifdef HAS_SPLITUVROW_NEON
 SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
@@ -369,7 +379,7 @@ SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
 MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) diff --git a/source/row_win.cc b/source/row_win.cc index cbb5e6c05..642dbe452 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -776,10 +776,10 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { align 16 convertloop: - vmovdqa ymm0, [eax] - vmovdqa ymm1, [eax + 32] - vmovdqa ymm2, [eax + 64] - vmovdqa ymm3, [eax + 96] + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] vpmaddubsw ymm0, ymm0, ymm4 vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 @@ -793,7 +793,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. vpaddb ymm0, ymm0, ymm5 sub ecx, 32 - vmovdqa [edx], ymm0 + vmovdqu [edx], ymm0 lea edx, [edx + 32] jg convertloop ret @@ -835,44 +835,6 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -#ifdef HAS_ARGBTOYROW_AVX2 -__declspec(naked) __declspec(align(32)) -void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* pix */ - vmovdqa ymm6, kShufARGBToY_AVX - vmovdqa ymm5, kAddY16_AVX - vmovdqa ymm4, kARGBToY_AVX - - align 16 - convertloop: - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpmaddubsw ymm0, ymm0, ymm4 - vpmaddubsw ymm1, ymm1, ymm4 - vpmaddubsw ymm2, ymm2, ymm4 - vpmaddubsw ymm3, ymm3, ymm4 - lea eax, [eax + 128] - vphaddw ymm0, ymm0, ymm1 - vphaddw ymm2, ymm2, ymm3 - vpsrlw ymm0, ymm0, 7 - vpsrlw ymm2, ymm2, 7 - vpackuswb ymm0, ymm0, ymm2 - vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 
- vpaddb ymm0, ymm0, ymm5 - sub ecx, 32 - vmovdqu [edx], ymm0 - lea edx, [edx + 32] - jg convertloop - ret - } -} -#endif // HAS_ARGBTOYROW_AVX2 - __declspec(naked) __declspec(align(16)) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { @@ -1162,11 +1124,11 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, align 16 convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - vmovdqa ymm0, [eax] - vmovdqa ymm1, [eax + 32] - vmovdqa ymm2, [eax + 64] - vmovdqa ymm3, [eax + 96] + /* step 1 - subsample 32x2 argb pixels to 16x1 */ + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] vpavgb ymm2, ymm2, [eax + esi + 64] @@ -1200,8 +1162,8 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, // step 3 - store 16 U and 16 V values sub ecx, 32 - vextractf128 qword ptr [edx], ymm0, 0 // U - vextractf128 qword ptr [edx + edi], ymm0, 1 // V + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] jg convertloop @@ -1282,75 +1244,6 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -#ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) __declspec(align(32)) -void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // pix - vmovdqa ymm7, kARGBToU_AVX - vmovdqa ymm6, kARGBToV_AVX - vmovdqa ymm5, kAddUV128_AVX - sub edi, edx // stride from u to v - - align 16 - convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ - vmovdqu ymm0, [eax] - vmovdqu ymm1, [eax + 32] - vmovdqu ymm2, [eax + 64] - vmovdqu ymm3, [eax + 96] - vpavgb ymm0, ymm0, [eax + esi] - vpavgb ymm1, ymm1, [eax + esi + 32] - vpavgb ymm2, ymm2, [eax + esi + 64] - vpavgb ymm3, ymm3, [eax + esi + 96] - lea eax, [eax + 128] - vshufps ymm4, ymm0, ymm1, 0x88 - vshufps ymm0, ymm0, ymm1, 0xdd - vpavgb ymm0, ymm0, ymm4 - vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. - vshufps ymm4, ymm2, ymm3, 0x88 - vshufps ymm2, ymm2, ymm3, 0xdd - vpavgb ymm2, ymm2, ymm4 - vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove. - - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V - vpmaddubsw ymm1, ymm0, ymm7 // U - vpmaddubsw ymm3, ymm2, ymm7 - vpmaddubsw ymm0, ymm0, ymm6 // V - vpmaddubsw ymm2, ymm2, ymm6 - vphaddw ymm1, ymm1, ymm3 - vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove. - vphaddw ymm0, ymm0, ymm2 - vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. 
- vpsraw ymm1, ymm1, 8 - vpsraw ymm0, ymm0, 8 - vpacksswb ymm0, ymm1, ymm0 - vpermq ymm0, ymm0, 0xd8 - vpaddb ymm0, ymm0, ymm5 // -> unsigned - - // step 3 - store 16 U and 16 V values - sub ecx, 32 - vextractf128 qword ptr [edx], ymm0, 0 // U - vextractf128 qword ptr [edx + edi], ymm0, 1 // V - lea edx, [edx + 16] - jg convertloop - - pop edi - pop esi - ret - } -} -#endif // HAS_ARGBTOUVROW_AVX2 - __declspec(naked) __declspec(align(16)) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { @@ -2044,7 +1937,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 411. -#define READYUV444 __asm { \ +#define READYUV444 __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ __asm lea esi, [esi + 8] \ @@ -2052,7 +1945,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; } // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 __asm { \ +#define READYUV422 __asm { \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ @@ -2061,7 +1954,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; } // Read 2 UV from 411, upsample to 8 UV. -#define READYUV411 __asm { \ +#define READYUV411 __asm { \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 2] \ @@ -2071,14 +1964,14 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; } // Read 4 UV from NV12, upsample to 8 UV. -#define READNV12 __asm { \ +#define READNV12 __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ __asm lea esi, [esi + 8] \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ } // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB __asm { \ +#define YUVTORGB __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ @@ -2106,7 +1999,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; } // Convert 8 pixels: 8 VU and 8 Y. 
-#define YVUTORGB __asm { \ +#define YVUTORGB __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ @@ -3207,37 +3100,6 @@ void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, } #endif // HAS_SPLITUVROW_SSE2 -#ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) __declspec(align(16)) -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width - sub edx, eax - - align 16 - convertloop: - movdqa xmm0, [eax] // read 16 U's - movdqa xmm1, [eax + edx] // and 16 V's - lea eax, [eax + 16] - movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs - movdqa [edi], xmm0 - movdqa [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - ret - } -} - #ifdef HAS_SPLITUVROW_AVX2 __declspec(naked) __declspec(align(16)) void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { @@ -3251,43 +3113,6 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { vpsrlw ymm5, ymm5, 8 sub edi, edx - align 16 - convertloop: - vmovdqa ymm0, [eax] - vmovdqa ymm1, [eax + 32] - lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes - vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes - vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 - vpackuswb ymm2, ymm2, ymm3 - vpermq ymm0, ymm0, 0xd8 - vpermq ymm2, ymm2, 0xd8 - vmovdqa [edx], ymm0 - vmovdqa [edx + edi], ymm2 - lea edx, [edx + 32] - sub ecx, 32 - jg convertloop - - pop edi - ret - } -} - -__declspec(naked) __declspec(align(16)) -void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int pix) { - __asm { - push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // pix - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff - vpsrlw ymm5, ymm5, 8 - sub edi, edx - align 16 convertloop: vmovdqu ymm0, [eax] @@ -3313,6 +3138,37 @@ void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, } #endif // HAS_SPLITUVROW_AVX2 +#ifdef HAS_MERGEUVROW_SSE2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 16 + convertloop: + movdqa xmm0, [eax] // read 16 U's + movdqa xmm1, [eax + edx] // and 16 V's + lea eax, [eax + 16] + movdqa xmm2, xmm0 + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs + movdqa [edi], xmm0 + movdqa [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + __declspec(naked) __declspec(align(16)) void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { @@ -3344,6 +3200,39 @@ void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, } #endif // HAS_MERGEUVROW_SSE2 +#ifdef HAS_MERGEUVROW_AVX2 +__declspec(naked) __declspec(align(16)) +void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, + int width) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv 
+ mov ecx, [esp + 4 + 16] // width + sub edx, eax + + align 16 + convertloop: + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's + lea eax, [eax + 32] + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 + vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 + vmovdqu [edi], ymm1 + vmovdqu [edi + 32], ymm2 + lea edi, [edi + 64] + sub ecx, 32 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_MERGEUVROW_AVX2 + #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. __declspec(naked) __declspec(align(16)) @@ -3368,6 +3257,24 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_SSE2 +#ifdef HAS_COPYROW_AVX2 +// Unaligned Multiple of 1. +__declspec(naked) __declspec(align(16)) +void CopyRow_AVX2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + rep movsb + mov edi, edx + mov esi, eax + ret + } +} +#endif // HAS_COPYROW_AVX2 + #ifdef HAS_COPYROW_X86 __declspec(naked) __declspec(align(16)) void CopyRow_X86(const uint8* src, uint8* dst, int count) { @@ -3434,6 +3341,226 @@ void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, } #endif // HAS_SETROW_X86 +#ifdef HAS_YUY2TOYROW_AVX2 +__declspec(naked) __declspec(align(16)) +void YUY2ToYRow_AVX2(const uint8* src_yuy2, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. 
+ vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToYRow_AVX2(const uint8* src_uyvy, + uint8* dst_y, int pix) { + __asm { + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // pix + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + sub ecx, 32 + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void UYVYToUV422Row_AVX2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + sub edi, edx + + align 16 + convertloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 // mutates. + vpermq ymm0, ymm0, 0xd8 + vpand ymm1, ymm0, ymm5 // U + vpsrlw ymm0, ymm0, 8 // V + vpackuswb ymm1, ymm1, ymm1 // mutates. 
+ vpackuswb ymm0, ymm0, ymm0 // mutates. + vpermq ymm1, ymm1, 0xd8 + vpermq ymm0, ymm0, 0xd8 + vextractf128 [edx], ymm1, 0 // U + vextractf128 [edx + edi], ymm0, 0 // V + lea edx, [edx + 16] + sub ecx, 32 + jg convertloop + + pop edi + ret + } +} +#endif // HAS_YUY2TOYROW_AVX2 + #ifdef HAS_YUY2TOYROW_SSE2 __declspec(naked) __declspec(align(16)) void YUY2ToYRow_SSE2(const uint8* src_yuy2, diff --git a/source/row_x86.asm b/source/row_x86.asm index 8deb7f749..3a028c196 100644 --- a/source/row_x86.asm +++ b/source/row_x86.asm @@ -64,9 +64,7 @@ YUY2TOYROW UYVY,a, YUY2TOYROW UYVY,u,_Unaligned INIT_YMM AVX2 YUY2TOYROW YUY2,a, -YUY2TOYROW YUY2,u,_Unaligned YUY2TOYROW UYVY,a, -YUY2TOYROW UYVY,u,_Unaligned ; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) @@ -107,7 +105,6 @@ SplitUVRow a, SplitUVRow u,_Unaligned INIT_YMM AVX2 SplitUVRow a, -SplitUVRow u,_Unaligned ; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ; int width); @@ -121,11 +118,17 @@ cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix mov%1 m0, [src_uq] mov%1 m1, [src_vq] lea src_uq, [src_uq + mmsize] - mova m2, m0 - punpcklbw m0, m0, m1 // first 8 UV pairs - punpckhbw m2, m2, m1 // next 8 UV pairs - mov%1 [dst_uvq], m0 + punpcklbw m2, m0, m1 // first 8 UV pairs + punpckhbw m0, m0, m1 // next 8 UV pairs +%if cpuflag(AVX2) + vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0 + vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0 + mov%1 [dst_uvq], m1 mov%1 [dst_uvq + mmsize], m2 +%else + mov%1 [dst_uvq], m2 + mov%1 [dst_uvq + mmsize], m0 +%endif lea dst_uvq, [dst_uvq + mmsize * 2] sub pixd, mmsize jg .convertloop @@ -140,4 +143,4 @@ MergeUVRow_ a, MergeUVRow_ u,_Unaligned INIT_YMM AVX2 MergeUVRow_ a, -MergeUVRow_ u,_Unaligned +
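For reference, this is the row-function dispatch pattern the patch applies in convert.cc and planar_functions.cc, shown as a minimal self-contained sketch. It is not part of the patch: the helper name YUY2ToYPlane is hypothetical, and it assumes the MSVC x86 build (the __asm vzeroupper form used above is MSVC inline assembly) plus the declarations from include/libyuv/row.h and include/libyuv/cpu_id.h (IS_ALIGNED and uint8 come in via libyuv/basic_types.h, which row.h includes).

#include "libyuv/cpu_id.h"  // TestCpuFlag, kCpuHasAVX2
#include "libyuv/row.h"     // row functions; pulls in basic_types.h

// Hypothetical helper: prefer the width-aligned AVX2 row, fall back to the
// _Any_ wrapper for other widths, run the C row otherwise, and issue a single
// vzeroupper after the loop if any AVX2 path was selected.
static void YUY2ToYPlane(const uint8* src_yuy2, int src_stride_yuy2,
                         uint8* dst_y, int dst_stride_y,
                         int width, int height) {
  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
      YUY2ToYRow_C;
#if defined(HAS_YUY2TOYROW_AVX2)
  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
    clear = true;  // assign, do not re-declare, so the vzeroupper below runs
    YUY2ToYRow = YUY2ToYRow_Any_AVX2;  // wrapper for widths not a multiple of 32
    if (IS_ALIGNED(width, 32)) {
      YUY2ToYRow = YUY2ToYRow_AVX2;    // whole 32-pixel iterations only
    }
  }
#endif
  for (int y = 0; y < height; ++y) {
    YUY2ToYRow(src_yuy2, dst_y, width);
    src_yuy2 += src_stride_yuy2;
    dst_y += dst_stride_y;
  }
#if defined(HAS_YUY2TOYROW_AVX2)
  if (clear) {
    __asm vzeroupper;  // clear upper YMM state before returning to SSE2 code
  }
#endif
}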