From 9b4c00b908d37727c6caf82337813d567732be1c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com>
Date: Thu, 4 Apr 2013 05:54:59 +0000
Subject: [PATCH] Move vzeroupper to row functions to simplify caller and
 allow mix of avx2 and sse2. Impact reduced by row coalescing.

BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |  2 +-
 include/libyuv/version.h    |  2 +-
 source/compare.cc           | 16 --------
 source/compare_win.cc       |  1 +
 source/convert.cc           | 38 ------------------
 source/convert_from.cc      |  8 ----
 source/convert_from_argb.cc | 14 -------
 source/planar_functions.cc  | 80 -------------------------------------
 source/rotate.cc            |  7 ----
 source/rotate_argb.cc       |  7 ----
 source/row_win.cc           | 38 ++++++++++++++----
 11 files changed, 33 insertions(+), 180 deletions(-)
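Note on the pattern applied throughout: each caller that could select an
AVX2 row function used to track a local "clear" flag and execute
"__asm vzeroupper" after its row loop; with this change every AVX2 row
function executes vzeroupper itself, just before its ret. A minimal sketch
of the before/after shapes, assuming libyuv's TestCpuFlag helper and the
IS_ALIGNED macro; ProcessPlane, Row_C and Row_AVX2 are illustrative names,
not code from this patch:

  #include "libyuv/basic_types.h"  // uint8
  #include "libyuv/cpu_id.h"       // TestCpuFlag, kCpuHasAVX2

  #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

  void Row_C(const uint8* src, uint8* dst, int width);     // reference row.
  void Row_AVX2(const uint8* src, uint8* dst, int width);  // defined below.

  // Before: caller-side cleanup, one flag per possible AVX2 row.
  void ProcessPlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride, int width, int height) {
    void (*Row)(const uint8*, uint8*, int) = Row_C;
    bool clear = false;
    if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
      clear = true;
      Row = Row_AVX2;
    }
    for (int y = 0; y < height; ++y) {
      Row(src, dst, width);
      src += src_stride;
      dst += dst_stride;
    }
    if (clear) {
      __asm vzeroupper;  // caller restores clean SSE state on exit.
    }
  }

  // After: the row function cleans up; callers drop the flag entirely.
  __declspec(naked) __declspec(align(16))
  void Row_AVX2(const uint8* src, uint8* dst, int width) {
    __asm {
      mov        eax, [esp + 4]   // src
      mov        edx, [esp + 8]   // dst
      mov        ecx, [esp + 12]  // width, assumed a multiple of 32.
    convertloop:
      vmovdqu    ymm0, [eax]
      lea        eax, [eax + 32]
      sub        ecx, 32
      vmovdqu    [edx], ymm0
      lea        edx, [edx + 32]
      jg         convertloop
      vzeroupper                  // clear upper ymm state before ret.
      ret
    }
  }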
diff --git a/README.chromium b/README.chromium
index e8a06720e..816d70514 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 639
+Version: 641
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 9b349fa08..93d8adda0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 639
+#define LIBYUV_VERSION 641
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index fde63c2f5..f8b358309 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2)) {
-    clear = true;
     // Note only used for multiples of 32 so count is not checked.
     SumSquareError = SumSquareError_AVX2;
   }
@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return sse;
 }
 
@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
     src_a += stride_a;
     src_b += stride_b;
   }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return sse;
 }
 
diff --git a/source/compare_win.cc b/source/compare_win.cc
index b8e74648c..b505917bb 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
     vpermq     ymm1, ymm0, 0x02  // high + low lane.
     vpaddd     ymm0, ymm0, ymm1
     vmovd      eax, xmm0
+    vzeroupper
     ret
   }
 }
diff --git a/source/convert.cc b/source/convert.cc
index 20b642b7c..446b87b09 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_HALFROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
-    clear = true;
     HalfRow = HalfRow_AVX2;
   }
 #endif
@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   if (height & 1) {
     HalfRow(src_v, 0, dst_v, halfwidth);
   }
-#if defined(HAS_HALFROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
     CopyRow(src_y, dst_y, width);
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
   }
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
   }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     UYVYToUVRow = UYVYToUVRow_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
   }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
   }
-
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/convert_from.cc b/source/convert_from.cc
index b0de08549..93f8bfd86 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
-    clear = true;
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       MergeUVRow_ = MergeUVRow_AVX2;
@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
     src_v += src_stride_v;
     dst_uv += dst_stride_uv;
   }
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7949c87c1..94a3086c2 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_y += dst_stride_y;
   }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d88dc60c3..77af629a1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     MirrorRow = MirrorRow_AVX2;
   }
 #endif
@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 // Convert YUY2 to I422.
@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
     ARGBMirrorRow = ARGBMirrorRow_AVX2;
   }
 #endif
@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBADDROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBAddRow = ARGBAddRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAddRow = ARGBAddRow_AVX2;
@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBADDROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
   }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
-    clear = true;
     ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     src_bgra += src_stride_bgra;
     dst_argb += dst_stride_argb;
   }
-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/rotate.cc b/source/rotate.cc
index b04493bfe..682737224 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     MirrorRow = MirrorRow_AVX2;
   }
 #endif
@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index cccfb9b48..38536f05c 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
     ARGBMirrorRow = ARGBMirrorRow_AVX2;
   }
 #endif
@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 LIBYUV_API
diff --git a/source/row_win.cc b/source/row_win.cc
index 2994c3634..7322d977e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
    ret
   }
 }
@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
     lea        eax, [eax + 32]
     jg         convertloop
 
+    vzeroupper
     ret
   }
 }
@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     lea        eax, [eax + 32]
     jg         convertloop
 
+    vzeroupper
     ret
   }
 }
@@ -4748,22 +4762,22 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     align      16
  convertloop:
     // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]  // alpha0
-    movzx      edi, byte ptr [eax + 7]  // alpha1
+    movzx      esi, byte ptr [eax + 3]   // alpha0
+    movzx      edi, byte ptr [eax + 7]   // alpha1
     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11] // alpha2
-    movzx      edi, byte ptr [eax + 15] // alpha3
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19] // alpha4
-    movzx      edi, byte ptr [eax + 23] // alpha5
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27] // alpha6
-    movzx      edi, byte ptr [eax + 31] // alpha7
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
     vmovdqu    [eax + edi], ymm0
     lea        eax, [eax + 32]
     jg         convertloop
+
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vmovdqu    [edx + 32], ymm1
     lea        edx, [edx + 64]
     jg         wloop
+
+    vzeroupper
     ret
   }
 }
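With the cleanup inside the row functions, one pass can now pair an AVX2
row with an SSE2 row and no caller-side vzeroupper is needed. An
illustrative caller sketch, assuming libyuv's TestCpuFlag and the
IS_ALIGNED macro; ConvertPass and the YRow_*/UVRow_* names are
placeholders, not functions from this patch (libyuv's real callers pick
pairs such as YUY2ToYRow_AVX2 and YUY2ToUV422Row_SSE2 the same way):

  // One YUY2 pass mixing an AVX2 Y row with an SSE2 UV422 row.
  void ConvertPass(const uint8* src_yuy2, int src_stride_yuy2,
                   uint8* dst_y, int dst_stride_y,
                   uint8* dst_u, uint8* dst_v, int width, int height) {
    void (*YRow)(const uint8*, uint8*, int) = YRow_C;
    void (*UVRow)(const uint8*, uint8*, uint8*, int) = UVRow_C;
    if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
      YRow = YRow_AVX2;    // leaves clean state: ends with vzeroupper.
    }
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
      UVRow = UVRow_SSE2;  // safe to run right after the AVX2 row.
    }
    int halfwidth = (width + 1) >> 1;
    for (int y = 0; y < height; ++y) {
      YRow(src_yuy2, dst_y, width);
      UVRow(src_yuy2, dst_u, dst_v, width);  // one U,V pair per 2 pixels.
      src_yuy2 += src_stride_yuy2;
      dst_y += dst_stride_y;
      dst_u += halfwidth;
      dst_v += halfwidth;
    }
    // No trailing "if (clear) __asm vzeroupper;" block is needed here.
  }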