From 9b4c00b908d37727c6caf82337813d567732be1c Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com>
Date: Thu, 4 Apr 2013 05:54:59 +0000
Subject: [PATCH] Move vzeroupper to row functions to simplify caller and
 allow mix of avx2 and sse2. Impact reduced by row coalescing.

BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium             |  2 +-
 include/libyuv/version.h    |  2 +-
 source/compare.cc           | 16 --------
 source/compare_win.cc       |  1 +
 source/convert.cc           | 38 ------------------
 source/convert_from.cc      |  8 ----
 source/convert_from_argb.cc | 14 -------
 source/planar_functions.cc  | 80 -------------------------------------
 source/rotate.cc            |  7 ----
 source/rotate_argb.cc       |  7 ----
 source/row_win.cc           | 38 ++++++++++++++----
 11 files changed, 33 insertions(+), 180 deletions(-)
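Note on the pattern applied throughout: each caller that could select an
AVX2 row function used to track a local "clear" flag and execute
"__asm vzeroupper" after its row loop; with this change every AVX2 row
function executes vzeroupper itself, just before its ret. A minimal sketch
of the before/after shapes, assuming libyuv's TestCpuFlag helper and the
IS_ALIGNED macro; ProcessPlane, Row_C and Row_AVX2 are illustrative names,
not code from this patch:

  #include "libyuv/basic_types.h"  // uint8
  #include "libyuv/cpu_id.h"       // TestCpuFlag, kCpuHasAVX2

  #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

  void Row_C(const uint8* src, uint8* dst, int width);     // reference row.
  void Row_AVX2(const uint8* src, uint8* dst, int width);  // defined below.

  // Before: caller-side cleanup, one flag per possible AVX2 row.
  void ProcessPlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride, int width, int height) {
    void (*Row)(const uint8*, uint8*, int) = Row_C;
    bool clear = false;
    if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
      clear = true;
      Row = Row_AVX2;
    }
    for (int y = 0; y < height; ++y) {
      Row(src, dst, width);
      src += src_stride;
      dst += dst_stride;
    }
    if (clear) {
      __asm vzeroupper;  // caller restores clean SSE state on exit.
    }
  }

  // After: the row function cleans up; callers drop the flag entirely.
  __declspec(naked) __declspec(align(16))
  void Row_AVX2(const uint8* src, uint8* dst, int width) {
    __asm {
      mov        eax, [esp + 4]   // src
      mov        edx, [esp + 8]   // dst
      mov        ecx, [esp + 12]  // width, assumed a multiple of 32.
    convertloop:
      vmovdqu    ymm0, [eax]
      lea        eax, [eax + 32]
      sub        ecx, 32
      vmovdqu    [edx], ymm0
      lea        edx, [edx + 32]
      jg         convertloop
      vzeroupper                  // clear upper ymm state before ret.
      ret
    }
  }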
diff --git a/README.chromium b/README.chromium
index e8a06720e..816d70514 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 639
+Version: 641
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 9b349fa08..93d8adda0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 639
+#define LIBYUV_VERSION 641
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/compare.cc b/source/compare.cc
index fde63c2f5..f8b358309 100644
--- a/source/compare.cc
+++ b/source/compare.cc
@@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2)) {
-    clear = true;
     // Note only used for multiples of 32 so count is not checked.
     SumSquareError = SumSquareError_AVX2;
   }
@@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
   if (remainder) {
     sse += SumSquareError_C(src_a, src_b, remainder);
   }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return sse;
 }
 
@@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
   }
 #endif
 #if defined(HAS_SUMSQUAREERROR_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
@@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
     src_a += stride_a;
     src_b += stride_b;
   }
-
-#if defined(HAS_SUMSQUAREERROR_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return sse;
 }
 
diff --git a/source/compare_win.cc b/source/compare_win.cc
index b8e74648c..b505917bb 100644
--- a/source/compare_win.cc
+++ b/source/compare_win.cc
@@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
     vpermq     ymm1, ymm0, 0x02  // high + low lane.
     vpaddd     ymm0, ymm0, ymm1
     vmovd      eax, xmm0
+    vzeroupper
     ret
   }
 }
diff --git a/source/convert.cc b/source/convert.cc
index 20b642b7c..446b87b09 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_HALFROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
-    clear = true;
     HalfRow = HalfRow_AVX2;
   }
 #endif
@@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
   if (height & 1) {
     HalfRow(src_v, 0, dst_v, halfwidth);
   }
-#if defined(HAS_HALFROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
     CopyRow(src_y, dst_y, width);
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
   }
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
     YUY2ToYRow(src_yuy2, dst_y, width);
   }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     UYVYToUVRow = UYVYToUVRow_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
   }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToUVRow = ARGBToUVRow_Any_AVX2;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
   }
-
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/convert_from.cc b/source/convert_from.cc
index b0de08549..93f8bfd86 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MERGEUVROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
-    clear = true;
     MergeUVRow_ = MergeUVRow_Any_AVX2;
     if (IS_ALIGNED(halfwidth, 32)) {
       MergeUVRow_ = MergeUVRow_AVX2;
@@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
     src_v += src_stride_v;
     dst_uv += dst_stride_uv;
   }
-#if defined(HAS_MERGEUVROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 7949c87c1..94a3086c2 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    clear = true;
     ARGBToYRow = ARGBToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       ARGBToYRow = ARGBToYRow_AVX2;
@@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_y += dst_stride_y;
   }
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index d88dc60c3..77af629a1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     MirrorRow = MirrorRow_AVX2;
   }
 #endif
@@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 // Convert YUY2 to I422.
@@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
   }
 #endif
 #if defined(HAS_YUY2TOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
     YUY2ToYRow = YUY2ToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-
-#if defined(HAS_YUY2TOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
   }
 #endif
 #if defined(HAS_UYVYTOYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
-    bool clear = true;
     UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
     UYVYToYRow = UYVYToYRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
@@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
   }
-
-#if defined(HAS_UYVYTOYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
     ARGBMirrorRow = ARGBMirrorRow_AVX2;
   }
 #endif
@@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBMULTIPLYROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBADDROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBAddRow = ARGBAddRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAddRow = ARGBAddRow_AVX2;
@@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBADDROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
   }
 #endif
 #if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBSubtractRow = ARGBSubtractRow_AVX2;
@@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBSUBTRACTROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
@@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
@@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
-    clear = true;
     ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
     if (IS_ALIGNED(width, 8)) {
       ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
-
-#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
-
   return 0;
 }
 
@@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
   }
 #endif
 #if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
-    clear = true;
     ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       ARGBShuffleRow = ARGBShuffleRow_AVX2;
@@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
     src_bgra += src_stride_bgra;
     dst_argb += dst_stride_argb;
   }
-#if defined(HAS_ARGBSHUFFLEROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
   return 0;
 }
 
diff --git a/source/rotate.cc b/source/rotate.cc
index b04493bfe..682737224 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_MIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
-    clear = true;
     MirrorRow = MirrorRow_AVX2;
   }
 #endif
@@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
-#if defined(HAS_MIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 static void TransposeUVWx8_C(const uint8* src, int src_stride,
diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc
index cccfb9b48..38536f05c 100644
--- a/source/rotate_argb.cc
+++ b/source/rotate_argb.cc
@@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
   }
 #endif
 #if defined(HAS_ARGBMIRRORROW_AVX2)
-  bool clear = false;
   if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
-    clear = true;
     ARGBMirrorRow = ARGBMirrorRow_AVX2;
   }
 #endif
@@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
     src_bot -= src_stride;
     dst_bot -= dst_stride;
   }
-#if defined(HAS_ARGBMIRRORROW_AVX2)
-  if (clear) {
-    __asm vzeroupper;
-  }
-#endif
 }
 
 LIBYUV_API
diff --git a/source/row_win.cc b/source/row_win.cc
index 2994c3634..7322d977e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
    ret
   }
 }
@@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
     jg         convertloop
+    vzeroupper
     ret
   }
 }
@@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
     jg         convertloop
 
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
     lea        eax, [eax + 32]
     jg         convertloop
 
+    vzeroupper
     ret
   }
 }
@@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     lea        eax, [eax + 32]
     jg         convertloop
 
+    vzeroupper
     ret
   }
 }
@@ -4748,22 +4762,22 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     align      16
  convertloop:
     // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]  // alpha0
-    movzx      edi, byte ptr [eax + 7]  // alpha1
+    movzx      esi, byte ptr [eax + 3]   // alpha0
+    movzx      edi, byte ptr [eax + 7]   // alpha1
     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11] // alpha2
-    movzx      edi, byte ptr [eax + 15] // alpha3
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
     vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19] // alpha4
-    movzx      edi, byte ptr [eax + 23] // alpha5
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
     vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
     vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
     vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27] // alpha6
-    movzx      edi, byte ptr [eax + 31] // alpha7
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
     vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
     vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
     vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
@@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 
     pop        edi
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
     jg         convertloop
 
     pop        esi
+    vzeroupper
     ret
   }
 }
@@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
     vmovdqu    [eax + edi], ymm0
     lea        eax, [eax + 32]
     jg         convertloop
+
     pop        edi
+    vzeroupper
     ret
   }
 }
@@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
     vmovdqu    [edx + 32], ymm1
     lea        edx, [edx + 64]
     jg         wloop
+
+    vzeroupper
     ret
   }
 }
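With the cleanup inside the row functions, one pass can now pair an AVX2
row with an SSE2 row and no caller-side vzeroupper is needed. An
illustrative caller sketch, assuming libyuv's TestCpuFlag and the
IS_ALIGNED macro; ConvertPass and the YRow_*/UVRow_* names are
placeholders, not functions from this patch (libyuv's real callers pick
pairs such as YUY2ToYRow_AVX2 and YUY2ToUV422Row_SSE2 the same way):

  // One YUY2 pass mixing an AVX2 Y row with an SSE2 UV422 row.
  void ConvertPass(const uint8* src_yuy2, int src_stride_yuy2,
                   uint8* dst_y, int dst_stride_y,
                   uint8* dst_u, uint8* dst_v, int width, int height) {
    void (*YRow)(const uint8*, uint8*, int) = YRow_C;
    void (*UVRow)(const uint8*, uint8*, uint8*, int) = UVRow_C;
    if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
      YRow = YRow_AVX2;    // leaves clean state: ends with vzeroupper.
    }
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
      UVRow = UVRow_SSE2;  // safe to run right after the AVX2 row.
    }
    int halfwidth = (width + 1) >> 1;
    for (int y = 0; y < height; ++y) {
      YRow(src_yuy2, dst_y, width);
      UVRow(src_yuy2, dst_u, dst_v, width);  // one U,V pair per 2 pixels.
      src_yuy2 += src_stride_yuy2;
      dst_y += dst_stride_y;
      dst_u += halfwidth;
      dst_v += halfwidth;
    }
    // No trailing "if (clear) __asm vzeroupper;" block is needed here.
  }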