Move vzeroupper into the row functions to simplify callers and allow mixing of avx2 and sse2. Performance impact is reduced by row coalescing.

BUG=none
TEST=all tests pass with sde
Review URL: https://webrtc-codereview.appspot.com/1269009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@641 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2013-04-04 05:54:59 +00:00
parent 91c50c3a7d
commit 9b4c00b908
11 changed files with 33 additions and 180 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 639 Version: 641
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 639 #define LIBYUV_VERSION 641
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -103,9 +103,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
} }
#endif #endif
#if defined(HAS_SUMSQUAREERROR_AVX2) #if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
clear = true;
// Note only used for multiples of 32 so count is not checked. // Note only used for multiples of 32 so count is not checked.
SumSquareError = SumSquareError_AVX2; SumSquareError = SumSquareError_AVX2;
} }
@ -133,12 +131,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
if (remainder) { if (remainder) {
sse += SumSquareError_C(src_a, src_b, remainder); sse += SumSquareError_C(src_a, src_b, remainder);
} }
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse; return sse;
} }
@ -164,9 +156,7 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
} }
#endif #endif
#if defined(HAS_SUMSQUAREERROR_AVX2) #if defined(HAS_SUMSQUAREERROR_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
SumSquareError = SumSquareError_AVX2; SumSquareError = SumSquareError_AVX2;
} }
#endif #endif
@ -176,12 +166,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
src_a += stride_a; src_a += stride_a;
src_b += stride_b; src_b += stride_b;
} }
#if defined(HAS_SUMSQUAREERROR_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return sse; return sse;
} }

View File

@ -94,6 +94,7 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
vpermq ymm1, ymm0, 0x02 // high + low lane. vpermq ymm1, ymm0, 0x02 // high + low lane.
vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm1
vmovd eax, xmm0 vmovd eax, xmm0
vzeroupper
ret ret
} }
} }

View File

@ -99,9 +99,7 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_HALFROW_AVX2) #if defined(HAS_HALFROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(halfwidth, 32)) {
clear = true;
HalfRow = HalfRow_AVX2; HalfRow = HalfRow_AVX2;
} }
#endif #endif
@ -136,11 +134,6 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
if (height & 1) { if (height & 1) {
HalfRow(src_v, 0, dst_v, halfwidth); HalfRow(src_v, 0, dst_v, halfwidth);
} }
#if defined(HAS_HALFROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -583,9 +576,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -623,11 +614,6 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
CopyRow(src_y, dst_y, width); CopyRow(src_y, dst_y, width);
YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -667,9 +653,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -704,12 +688,6 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
YUY2ToYRow(src_yuy2, dst_y, width); YUY2ToYRow(src_yuy2, dst_y, width);
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -749,9 +727,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_AVX2) #if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUVRow = UYVYToUVRow_Any_AVX2; UYVYToUVRow = UYVYToUVRow_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -786,12 +762,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
UYVYToYRow(src_uyvy, dst_y, width); UYVYToYRow(src_uyvy, dst_y, width);
} }
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -834,9 +804,7 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToUVRow = ARGBToUVRow_Any_AVX2; ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -873,12 +841,6 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb, dst_y, width);
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }

View File

@ -483,9 +483,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_MERGEUVROW_AVX2) #if defined(HAS_MERGEUVROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) { if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
clear = true;
MergeUVRow_ = MergeUVRow_Any_AVX2; MergeUVRow_ = MergeUVRow_Any_AVX2;
if (IS_ALIGNED(halfwidth, 32)) { if (IS_ALIGNED(halfwidth, 32)) {
MergeUVRow_ = MergeUVRow_AVX2; MergeUVRow_ = MergeUVRow_AVX2;
@ -509,12 +507,6 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
src_v += src_stride_v; src_v += src_stride_v;
dst_uv += dst_stride_uv; dst_uv += dst_stride_uv;
} }
#if defined(HAS_MERGEUVROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }

View File

@ -218,9 +218,7 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
@ -250,11 +248,6 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -690,9 +683,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBTOYROW_AVX2) #if defined(HAS_ARGBTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
clear = true;
ARGBToYRow = ARGBToYRow_Any_AVX2; ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2; ARGBToYRow = ARGBToYRow_AVX2;
@ -713,11 +704,6 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
#if defined(HAS_ARGBTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }

View File

@ -197,9 +197,7 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
} }
#endif #endif
#if defined(HAS_MIRRORROW_AVX2) #if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2; MirrorRow = MirrorRow_AVX2;
} }
#endif #endif
@ -210,11 +208,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y,
src_y += src_stride_y; src_y += src_stride_y;
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
// Convert YUY2 to I422. // Convert YUY2 to I422.
@ -264,9 +257,7 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
} }
#endif #endif
#if defined(HAS_YUY2TOYROW_AVX2) #if defined(HAS_YUY2TOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -296,12 +287,6 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_YUY2TOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -352,9 +337,7 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
} }
#endif #endif
#if defined(HAS_UYVYTOYROW_AVX2) #if defined(HAS_UYVYTOYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
bool clear = true;
UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
UYVYToYRow = UYVYToYRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
@ -384,12 +367,6 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
dst_u += dst_stride_u; dst_u += dst_stride_u;
dst_v += dst_stride_v; dst_v += dst_stride_v;
} }
#if defined(HAS_UYVYTOYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -473,9 +450,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBMIRRORROW_AVX2) #if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2; ARGBMirrorRow = ARGBMirrorRow_AVX2;
} }
#endif #endif
@ -491,12 +466,6 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -601,9 +570,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBMULTIPLYROW_AVX2) #if defined(HAS_ARGBMULTIPLYROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBMultiplyRow = ARGBMultiplyRow_AVX2; ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
@ -626,12 +593,6 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBMULTIPLYROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -674,9 +635,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBADDROW_AVX2) #if defined(HAS_ARGBADDROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAddRow = ARGBAddRow_Any_AVX2; ARGBAddRow = ARGBAddRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBAddRow = ARGBAddRow_AVX2; ARGBAddRow = ARGBAddRow_AVX2;
@ -699,12 +658,6 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBADDROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -747,9 +700,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
} }
#endif #endif
#if defined(HAS_ARGBSUBTRACTROW_AVX2) #if defined(HAS_ARGBSUBTRACTROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBSubtractRow = ARGBSubtractRow_AVX2; ARGBSubtractRow = ARGBSubtractRow_AVX2;
@ -772,12 +723,6 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
src_argb1 += src_stride_argb1; src_argb1 += src_stride_argb1;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBSUBTRACTROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -1223,9 +1168,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBATTENUATEROW_AVX2) #if defined(HAS_ARGBATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBAttenuateRow = ARGBAttenuateRow_AVX2; ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
@ -1246,13 +1189,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -1289,9 +1225,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
} }
#endif #endif
#if defined(HAS_ARGBUNATTENUATEROW_AVX2) #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 8) { if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
clear = true;
ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
@ -1305,13 +1239,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
src_argb += src_stride_argb; src_argb += src_stride_argb;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }
@ -1791,9 +1718,7 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
} }
#endif #endif
#if defined(HAS_ARGBSHUFFLEROW_AVX2) #if defined(HAS_ARGBSHUFFLEROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && width >= 16) { if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
clear = true;
ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBShuffleRow = ARGBShuffleRow_AVX2; ARGBShuffleRow = ARGBShuffleRow_AVX2;
@ -1814,11 +1739,6 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
src_bgra += src_stride_bgra; src_bgra += src_stride_bgra;
dst_argb += dst_stride_argb; dst_argb += dst_stride_argb;
} }
#if defined(HAS_ARGBSHUFFLEROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
return 0; return 0;
} }

View File

@ -882,9 +882,7 @@ void RotatePlane180(const uint8* src, int src_stride,
} }
#endif #endif
#if defined(HAS_MIRRORROW_AVX2) #if defined(HAS_MIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
clear = true;
MirrorRow = MirrorRow_AVX2; MirrorRow = MirrorRow_AVX2;
} }
#endif #endif
@ -942,11 +940,6 @@ void RotatePlane180(const uint8* src, int src_stride,
src_bot -= src_stride; src_bot -= src_stride;
dst_bot -= dst_stride; dst_bot -= dst_stride;
} }
#if defined(HAS_MIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
static void TransposeUVWx8_C(const uint8* src, int src_stride, static void TransposeUVWx8_C(const uint8* src, int src_stride,

View File

@ -101,9 +101,7 @@ void ARGBRotate180(const uint8* src, int src_stride,
} }
#endif #endif
#if defined(HAS_ARGBMIRRORROW_AVX2) #if defined(HAS_ARGBMIRRORROW_AVX2)
bool clear = false;
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) { if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
clear = true;
ARGBMirrorRow = ARGBMirrorRow_AVX2; ARGBMirrorRow = ARGBMirrorRow_AVX2;
} }
#endif #endif
@ -159,11 +157,6 @@ void ARGBRotate180(const uint8* src, int src_stride,
src_bot -= src_stride; src_bot -= src_stride;
dst_bot -= dst_stride; dst_bot -= dst_stride;
} }
#if defined(HAS_ARGBMIRRORROW_AVX2)
if (clear) {
__asm vzeroupper;
}
#endif
} }
LIBYUV_API LIBYUV_API

View File

@ -763,6 +763,7 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -1277,6 +1278,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -3133,6 +3135,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -3254,6 +3257,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -3367,6 +3371,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
@ -3462,6 +3467,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
@ -3599,6 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2,
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -3643,6 +3650,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -3682,6 +3690,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
@ -3708,6 +3717,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy,
lea edx, [edx + 32] lea edx, [edx + 32]
jg convertloop jg convertloop
ret ret
vzeroupper
} }
} }
@ -3751,6 +3761,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -3790,6 +3801,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
@ -4633,6 +4645,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -4727,6 +4740,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
vzeroupper
ret ret
} }
} }
@ -4790,6 +4804,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
pop edi pop edi
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -5240,6 +5255,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -5270,6 +5286,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -5299,6 +5316,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
jg convertloop jg convertloop
pop esi pop esi
vzeroupper
ret ret
} }
} }
@ -6053,7 +6071,9 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
vmovdqu [eax + edi], ymm0 vmovdqu [eax + edi], ymm0
lea eax, [eax + 32] lea eax, [eax + 32]
jg convertloop jg convertloop
pop edi pop edi
vzeroupper
ret ret
} }
} }
@ -6162,6 +6182,8 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
vmovdqu [edx + 32], ymm1 vmovdqu [edx + 32], ymm1
lea edx, [edx + 64] lea edx, [edx + 64]
jg wloop jg wloop
vzeroupper
ret ret
} }
} }