diff --git a/README.chromium b/README.chromium index ac756e3c5..70959eb39 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1346 +Version: 1347 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 07033fe72..1fea6ce17 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -23,6 +23,13 @@ extern "C" { #define LIBYUV_DISABLE_X86 #endif +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + + // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) @@ -42,6 +49,11 @@ extern "C" { #define HAS_FIXEDDIV1_X86 #endif +// The following are available on VS2012. +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_SCALEROWDOWN2_AVX2 +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) @@ -202,6 +214,8 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -228,6 +242,8 @@ void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void 
ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 09f67755a..19f3bc1b3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1346 +#define LIBYUV_VERSION 1347 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 3c7f761a4..b8314c70c 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, return 0; } -// Get a blender that optimized for the CPU, alignment and pixel count. +// Get a blender that is optimized for the CPU and pixel count. // As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. LIBYUV_API diff --git a/source/scale.cc b/source/scale.cc index 15056732a..1812ac139 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -77,6 +77,16 @@ static void ScalePlaneDown2(int src_width, int src_height, } } #endif +// TODO(fbarchard): Do other filter modes. 
+#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && + (filtering == kFilterBox || filtering == kFilterBilinear)) { + ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = ScaleRowDown2Box_AVX2; + } + } +#endif #if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && diff --git a/source/scale_any.cc b/source/scale_any.cc index 8adfbd3e1..d62c76b7e 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -52,8 +52,12 @@ CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, - ScaleRowDown2Box_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, + 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2,ScaleRowDown2Box_C, + 2, 1, 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) diff --git a/source/scale_win.cc b/source/scale_win.cc index 770828c77..6474880cd 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -94,7 +94,6 @@ static uvec16 kScaleAb2 = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; // Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -121,7 +120,6 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. 
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -158,7 +156,6 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -200,8 +197,47 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } +// Blends 64x2 rectangle to 32x1. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] // average rows + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + + vpsrlw ymm2, ymm0, 8 // average columns (32 to 16 pixels) + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} + // Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
__declspec(naked) __declspec(align(16)) void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -233,7 +269,6 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -297,7 +332,6 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Then shuffled to do the scaling. // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -345,7 +379,6 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, @@ -403,7 +436,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, @@ -751,7 +783,6 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
__declspec(naked) __declspec(align(16)) void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { @@ -777,7 +808,6 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, @@ -803,7 +833,6 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, @@ -832,7 +861,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, @@ -867,7 +895,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -904,7 +931,6 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, @@ -1115,7 +1141,6 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
__declspec(naked) __declspec(align(16)) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) {