diff --git a/README.chromium b/README.chromium
index 64d025562..dddf0ae85 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 587
+Version: 588
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f051b3def..b7cbd6e29 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -141,6 +141,9 @@ extern "C" {
 // Effects
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
 #endif
 #endif
 
@@ -1011,6 +1014,10 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                               uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
 void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
@@ -1023,6 +1030,10 @@ void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                      uint8* dst_argb, int width);
 void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                          uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
 void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                      uint8* dst_argb, int width);
 void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
@@ -1036,6 +1047,10 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                               uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
 void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
                           uint8* dst_argb, int width);
 void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index d4755a768..9907e217e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 587
+#define LIBYUV_VERSION 588
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 357b081a5..db8ad43b5 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -28,6 +28,7 @@ LIBYUV_API
 void CopyPlane(const uint8* src_y, int src_stride_y,
                uint8* dst_y, int dst_stride_y, int width, int height) {
+  // Coalesce contiguous rows.
   if (src_stride_y == width && dst_stride_y == width) {
     CopyPlane(src_y, 0, dst_y, 0, width * height, 1);
     return;
   }
@@ -503,7 +504,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
   return 0;
 }
 
-// Multiply 2 ARGB images together and store to destination.
+// Multiply 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
                  const uint8* src_argb1, int src_stride_argb1,
@@ -518,6 +519,15 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBMultiply(src_argb0, 0,
+                        src_argb1, 0,
+                        dst_argb, 0,
+                        width * height, 1);
+  }
   void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1,
                           uint8* dst, int width) = ARGBMultiplyRow_C;
 
@@ -531,7 +541,18 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
       ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBMULTIPLYROW_NEON)
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -547,10 +568,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
-// Add 2 ARGB images together and store to destination.
+// Add 2 ARGB images and store to destination.
 LIBYUV_API
 int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
             const uint8* src_argb1, int src_stride_argb1,
@@ -565,6 +592,15 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBAdd(src_argb0, 0,
+                   src_argb1, 0,
+                   dst_argb, 0,
+                   width * height, 1);
+  }
   void (*ARGBAddRow)(const uint8* src0, const uint8* src1,
                      uint8* dst, int width) = ARGBAddRow_C;
 
@@ -578,7 +614,18 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
       ARGBAddRow = ARGBAddRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBADDROW_NEON)
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBAddRow = ARGBAddRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBAddRow = ARGBAddRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -594,6 +641,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
 
@@ -612,6 +665,15 @@
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
+  // Coalesce contiguous rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    return ARGBSubtract(src_argb0, 0,
+                        src_argb1, 0,
+                        dst_argb, 0,
+                        width * height, 1);
+  }
   void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1,
                           uint8* dst, int width) = ARGBSubtractRow_C;
 
@@ -625,7 +687,18 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
       ARGBSubtractRow = ARGBSubtractRow_SSE2;
     }
   }
-#elif defined(HAS_ARGBSUBTRACTROW_NEON)
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    clear = true;
+    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
@@ -641,6 +714,12 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
diff --git a/source/row_any.cc b/source/row_any.cc
index 6b434f384..723a56652 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -420,6 +420,17 @@ MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
 MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
             3)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
+            7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
+            7)
+#endif
 #ifdef HAS_ARGBMULTIPLYROW_NEON
 MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
             7)
diff --git a/source/row_common.cc b/source/row_common.cc
index 61bf5e269..ee5c4b9e7 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -751,7 +751,7 @@ void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
 }
 #undef SHADE
 
-#define SHADE(f, v) (v >= f) ? 0 : (f - v)
+#define SHADE(f, v) ((f - v) > f) ? 0 : (f - v)
 
 void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
diff --git a/source/row_win.cc b/source/row_win.cc
index bb070d06d..a05828c5e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4915,6 +4915,102 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm3, [eax + esi]  // read 8 pixels from src_argb1
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]        // read 8 pixels from src_argb0
+    vpaddusb   ymm0, ymm0, [eax + esi]  // add 8 pixels from src_argb1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    sub        edx, eax
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]        // read 8 pixels from src_argb0
+    vpsubusb   ymm0, ymm0, [eax + esi]  // src_argb0 - src_argb1
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 // Consider float CumulativeSum.
 // Consider calling CumulativeSum one row at time as needed.