Add linear and point-sample row functions that scale to half size for AVX2.

BUG=314
TESTED=out\release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=*.ScaleDownBy2*
R=tpsiaki@google.com

Review URL: https://webrtc-codereview.appspot.com/44959004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1349 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2015-03-30 21:46:08 +00:00
parent 9ef8999ff3
commit 72673ac873
6 changed files with 95 additions and 18 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1348 Version: 1349
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -12,6 +12,7 @@
#define INCLUDE_LIBYUV_SCALE_ROW_H_ #define INCLUDE_LIBYUV_SCALE_ROW_H_
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/scale.h"
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
@ -214,6 +215,10 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -242,6 +247,10 @@ void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width); uint8* dst_ptr, int dst_width);
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1348 #define LIBYUV_VERSION 1349
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -77,13 +77,15 @@ static void ScalePlaneDown2(int src_width, int src_height,
} }
} }
#endif #endif
// TODO(fbarchard): Do other filter modes.
#if defined(HAS_SCALEROWDOWN2_AVX2) #if defined(HAS_SCALEROWDOWN2_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && if (TestCpuFlag(kCpuHasAVX2)) {
(filtering == kFilterBox || filtering == kFilterBilinear)) { // ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; // (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
// ScaleRowDown2Box_Any_AVX2);
if (IS_ALIGNED(dst_width, 32)) { if (IS_ALIGNED(dst_width, 32)) {
ScaleRowDown2 = ScaleRowDown2Box_AVX2; ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
(filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
ScaleRowDown2Box_AVX2);
} }
} }
#endif #endif

View File

@ -56,6 +56,9 @@ SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C,
2, 1, 15) 2, 1, 15)
#endif #endif
#ifdef HAS_SCALEROWDOWN2_AVX2 #ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
ScaleRowDown2Linear_C, 2, 1, 31)
SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
2, 1, 31) 2, 1, 31)
#endif #endif

View File

@ -199,6 +199,70 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
} }
#ifdef HAS_SCALEROWDOWN2_AVX2 #ifdef HAS_SCALEROWDOWN2_AVX2
// Point-samples one row down to half width: each loop iteration reads 64
// source pixels, throws the even-indexed half away and writes the 32
// odd-indexed pixels.
//   src_ptr    - source row (one byte per pixel).
//   src_stride - unused; only a single source row is read.
//   dst_ptr    - destination row.
//   dst_width  - number of destination pixels to produce.
// The loop body runs at least once and then repeats while the remaining
// count is still positive, always storing a full 32 bytes, so dst_width is
// expected to be a positive multiple of 32.
// Declared naked: the arguments are read directly from the stack relative to
// esp and the asm block issues its own ret.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
wloop:
// Load 64 source pixels (unaligned loads) and advance the source pointer.
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
// Shifting each 16-bit lane right by 8 moves the odd (high-byte) pixel into
// the low byte and zeroes the high byte, discarding the even pixel.
vpsrlw ymm0, ymm0, 8 // isolate odd pixels.
vpsrlw ymm1, ymm1, 8
// Narrow the 16-bit lanes back to bytes. The pack works on each 128-bit half
// independently, so the result quadwords come out in the order
// ymm0.lo, ymm1.lo, ymm0.hi, ymm1.hi.
vpackuswb ymm0, ymm0, ymm1
// 0xd8 selects quadwords 0,2,1,3, restoring linear pixel order.
vpermq ymm0, ymm0, 0xd8 // undo the lane interleave left by vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
// 32 destination pixels done; loop while the signed remainder is > 0.
sub ecx, 32
jg wloop
// Clear the upper halves of the ymm registers before returning to the caller.
vzeroupper
ret
}
}
// Blends 64x1 rectangle to 32x1.
// Each destination pixel is the rounded average of a horizontal pair of
// source pixels: (even + odd + 1) >> 1.
//   src_ptr    - source row (one byte per pixel).
//   src_stride - unused; only a single source row is read.
//   dst_ptr    - destination row.
//   dst_width  - number of destination pixels to produce.
// The loop body runs at least once and then repeats while the remaining
// count is still positive, always storing a full 32 bytes, so dst_width is
// expected to be a positive multiple of 32.
// Declared naked: the arguments are read directly from the stack relative to
// esp and the asm block issues its own ret.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
__asm {
mov eax, [esp + 4] // src_ptr
// src_stride ignored
mov edx, [esp + 12] // dst_ptr
mov ecx, [esp + 16] // dst_width
// Build the loop-invariant mask: all ones, then shift every 16-bit lane
// right by 8 so each lane holds 0x00ff.
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpsrlw ymm5, ymm5, 8
wloop:
// Load 64 source pixels (unaligned loads) and advance the source pointer.
vmovdqu ymm0, [eax]
vmovdqu ymm1, [eax + 32]
lea eax, [eax + 64]
// ymm2/ymm3 = odd pixels widened to 16 bits (high byte shifted down).
vpsrlw ymm2, ymm0, 8 // average columns (32 to 16 pixels)
vpsrlw ymm3, ymm1, 8
// ymm0/ymm1 = even pixels widened to 16 bits (low byte kept by the mask).
vpand ymm0, ymm0, ymm5
vpand ymm1, ymm1, ymm5
// Rounded average of each even/odd pair: (even + odd + 1) >> 1.
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
// Narrow the 16-bit averages back to bytes. The pack works on each 128-bit
// half independently, so the result quadwords come out in the order
// ymm0.lo, ymm1.lo, ymm0.hi, ymm1.hi.
vpackuswb ymm0, ymm0, ymm1
// 0xd8 selects quadwords 0,2,1,3, restoring linear pixel order.
vpermq ymm0, ymm0, 0xd8 // undo the lane interleave left by vpackuswb
vmovdqu [edx], ymm0
lea edx, [edx + 32]
// 32 destination pixels done; loop while the signed remainder is > 0.
sub ecx, 32
jg wloop
// Clear the upper halves of the ymm registers before returning to the caller.
vzeroupper
ret
}
}
// Blends 64x2 rectangle to 32x1. // Blends 64x2 rectangle to 32x1.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
@ -209,11 +273,8 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
mov esi, [esp + 4 + 8] // src_stride mov esi, [esp + 4 + 8] // src_stride
mov edx, [esp + 4 + 12] // dst_ptr mov edx, [esp + 4 + 12] // dst_ptr
mov ecx, [esp + 4 + 16] // dst_width mov ecx, [esp + 4 + 16] // dst_width
vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
vpcmpeqb ymm4, ymm4, ymm4 vpsrlw ymm5, ymm5, 8
vpsrlw ymm4, ymm4, 15 // '1' constant, 16b
vpackuswb ymm4, ymm4, ymm4 // '1' constant, 8b
vpxor ymm5, ymm5, ymm5 // constant 0
wloop: wloop:
vmovdqu ymm0, [eax] vmovdqu ymm0, [eax]
@ -222,12 +283,14 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
vpavgb ymm1, ymm1, [eax + esi + 32] vpavgb ymm1, ymm1, [eax + esi + 32]
lea eax, [eax + 64] lea eax, [eax + 64]
vpmaddubsw ymm0, ymm0, ymm4 // add horizontally vpsrlw ymm2, ymm0, 8 // average columns (32 to 16 pixels)
vpmaddubsw ymm1, ymm1, ymm4 vpsrlw ymm3, ymm1, 8
vpavgw ymm0, ymm0, ymm5 // (x+1) >> 1 vpand ymm0, ymm0, ymm5
vpavgw ymm1, ymm1, ymm5 vpand ymm1, ymm1, ymm5
vpavgw ymm0, ymm0, ymm2
vpavgw ymm1, ymm1, ymm3
vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpermq ymm0, ymm0, 0xd8 // unmutate
vmovdqu [edx], ymm0 vmovdqu [edx], ymm0
lea edx, [edx + 32] lea edx, [edx + 32]