Move vzeroupper into the AVX2 row functions to simplify their callers and to allow AVX2 and SSE2 row functions to be mixed. The cost of executing it per row is reduced by row coalescing.

BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009
git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2013-04-04 05:54:59 +00:00 · 2013-04-04 05:54:59 +00:00 · 9b4c00b908
commit 9b4c00b908
parent 91c50c3a7d
11 changed files with 33 additions and 180 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 639
+Version: 641
 License: BSD
 License File: LICENSE

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 639
+#define LIBYUV_VERSION 641

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/compare.cc
+++ b/source/compare.cc
@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
  }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2)) {
-    clear = true;
    // Note only used for multiples of 32 so count is not checked.
    SumSquareError = SumSquareError_AVX2;
  }
@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return sse;
 }

@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
  }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
    SumSquareError = SumSquareError_AVX2;
  }
 #endif
@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
    src_a += stride_a;
    src_b += stride_b;
  }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return sse;
 }

--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
+    vzeroupper
    ret
  }
 }
--- a/source/convert.cc
+++ b/source/convert.cc
@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_HALFROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
-    clear = true;
    HalfRow = HalfRow_AVX2;
  }
 #endif
@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
  if (height & 1) {
    HalfRow(src_v, 0, dst_v, halfwidth);
  }
-#if defined(HAS_HALFROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
    CopyRow(src_y, dst_y, width);
    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
  }
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
  }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
    YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
    YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
    YUY2ToYRow(src_yuy2, dst_y, width);
  }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
  }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
    UYVYToUVRow = UYVYToUVRow_Any_AVX2;
    UYVYToYRow = UYVYToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
    UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
    UYVYToYRow(src_uyvy, dst_y, width);
  }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
    ARGBToYRow(src_argb, dst_y, width);
  }
-
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
-    clear = true;
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
      MergeUVRow_ = MergeUVRow_AVX2;
@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
    src_v += src_stride_v;
    dst_uv += dst_stride_uv;
  }
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
  return 0;
 }

--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
  }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
    MirrorRow = MirrorRow_AVX2;
  }
 #endif
@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
    src_y += src_stride_y;
    dst_y += dst_stride_y;
  }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }

 // Convert YUY2 to I422.
@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
  }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
  }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
    UYVYToYRow = UYVYToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
    ARGBMirrorRow = ARGBMirrorRow_AVX2;
  }
 #endif
@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
    src_argb += src_stride_argb;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
  }
 #endif
 #if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
  }
 #endif
 #if defined(HAS_ARGBADDROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
    ARGBAddRow = ARGBAddRow_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBAddRow = ARGBAddRow_AVX2;
@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBADDROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
  }
 #endif
 #if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBSubtractRow = ARGBSubtractRow_AVX2;
@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
    src_argb += src_stride_argb;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
  return 0;
 }

@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
  }
 #endif
 #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
    src_argb += src_stride_argb;
    dst_argb += dst_stride_argb;
  }
-
-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
  return 0;
 }

@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
  }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
-    clear = true;
    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
    if (IS_ALIGNED(width, 16)) {
      ARGBShuffleRow = ARGBShuffleRow_AVX2;
@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
    src_bgra += src_stride_bgra;
    dst_argb += dst_stride_argb;
  }
-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
  return 0;
 }

--- a/source/rotate.cc
+++ b/source/rotate.cc
@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
  }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
    MirrorRow = MirrorRow_AVX2;
  }
 #endif
@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }

 static void TransposeUVWx8_C(const uint8* src, int src_stride,
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
  }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
    ARGBMirrorRow = ARGBMirrorRow_AVX2;
  }
 #endif
@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }

 LIBYUV_API
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
+    vzeroupper
    ret
  }
 }
@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,

    pop        edi
    pop        esi
+    vzeroupper
    ret
  }
 }
@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    jg        convertloop
+    vzeroupper
    ret
  }
 }
@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    jg        convertloop
+    vzeroupper
    ret
  }
 }
@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
    jg         convertloop

    pop        edi
+    vzeroupper
    ret
  }
 }
@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
    jg         convertloop

    pop        edi
+    vzeroupper
    ret
  }
 }
@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
+    vzeroupper
    ret
  }
 }
@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,

    pop        edi
    pop        esi
+    vzeroupper
    ret
  }
 }
@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
    jg         convertloop

    pop        edi
+    vzeroupper
    ret
  }
 }
@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
    lea        edx, [edx + 32]
    jg         convertloop
+    vzeroupper
    ret
  }
 }

@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,

    pop        edi
    pop        esi
+    vzeroupper
    ret
  }
 }
@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
    jg         convertloop

    pop        edi
+    vzeroupper
    ret
  }
 }
@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
    lea        eax, [eax + 32]
    jg         convertloop

+    vzeroupper
    ret
  }
 }
@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
    lea        eax, [eax + 32]
    jg         convertloop

+    vzeroupper
    ret
  }
 }
@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,

    pop        edi
    pop        esi
+    vzeroupper
    ret
  }
 }
@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
    jg         convertloop

    pop        esi
+    vzeroupper
    ret
  }
 }
@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
    jg         convertloop

    pop        esi
+    vzeroupper
    ret
  }
 }
@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
    jg         convertloop

    pop        esi
+    vzeroupper
    ret
  }
 }
@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
    vmovdqu    [eax + edi], ymm0
    lea        eax,  [eax + 32]
    jg         convertloop
+
    pop        edi
+    vzeroupper
    ret
  }
 }
@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    jg         wloop
+
+    vzeroupper
    ret
  }
 }