diff --git a/README.chromium b/README.chromium index bddd20375..facea334a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1348 +Version: 1349 License: BSD License File: LICENSE diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 1fea6ce17..3652c8848 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -12,6 +12,7 @@ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" +#include "libyuv/scale.h" #ifdef __cplusplus namespace libyuv { @@ -214,6 +215,10 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -242,6 +247,10 @@ void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 91e22b9d9..39d66a2e9 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 
@@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1348 +#define LIBYUV_VERSION 1349 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 1812ac139..3f2dc50b5 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -77,13 +77,15 @@ static void ScalePlaneDown2(int src_width, int src_height, } } #endif -// TODO(fbarchard): Do other filter modes. #if defined(HAS_SCALEROWDOWN2_AVX2) - if (TestCpuFlag(kCpuHasAVX2) && - (filtering == kFilterBox || filtering == kFilterBilinear)) { - ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; + if (TestCpuFlag(kCpuHasAVX2)) { +// ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : +// (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : +// ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = ScaleRowDown2Box_AVX2; + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : + ScaleRowDown2Box_AVX2); } } #endif diff --git a/source/scale_any.cc b/source/scale_any.cc index d62c76b7e..2b3ff4953 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -56,7 +56,10 @@ SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, 2, 1, 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 -SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2,ScaleRowDown2Box_C, +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, 2, 1, 31) +SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, 2, 1, 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON diff --git a/source/scale_win.cc b/source/scale_win.cc index 99f83ab6d..c8a474bcb 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -199,6 +199,70 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } #ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 
pixels, throws half away and writes 32 pixels. +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // keep odd pixels (high byte of each word). + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // undo vpackuswb per-lane interleave + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 // 32 output pixels per iteration + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1 (rounded average of each pixel pair). +__declspec(naked) __declspec(align(16)) +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff (low byte of each word) + vpsrlw ymm5, ymm5, 8 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + + vpsrlw ymm2, ymm0, 8 // average columns (32 to 16 pixels): odd pixels + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 // even pixels + vpand ymm1, ymm1, ymm5 + vpavgw ymm0, ymm0, ymm2 // (even + odd + 1) >> 1 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // undo vpackuswb per-lane interleave + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 // 32 output pixels per iteration + jg wloop + + vzeroupper + ret + } +} + // Blends 64x2 rectangle to 32x1. 
__declspec(naked) __declspec(align(16)) void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, @@ -209,11 +273,8 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, mov esi, [esp + 4 + 8] // src_stride mov edx, [esp + 4 + 12] // dst_ptr mov ecx, [esp + 4 + 16] // dst_width - - vpcmpeqb ymm4, ymm4, ymm4 - vpsrlw ymm4, ymm4, 15 // '1' constant, 16b - vpackuswb ymm4, ymm4, ymm4 // '1' constant, 8b - vpxor ymm5, ymm5, ymm5 // constant 0 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + vpsrlw ymm5, ymm5, 8 wloop: vmovdqu ymm0, [eax] @@ -222,12 +283,14 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // add horizontally - vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x+1) >> 1 - vpavgw ymm1, ymm1, ymm5 + vpsrlw ymm2, ymm0, 8 // average columns (32 to 16 pixels) + vpsrlw ymm3, ymm1, 8 + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vmovdqu [edx], ymm0 lea edx, [edx + 32]