MergeUV AVX2: use vextractf128 to store results, avoiding the vperm2i128 shuffles.

BUG=none
TESTED=intel sde on unittests
R=brucedawson@google.com

Review URL: https://webrtc-codereview.appspot.com/33369004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1178 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2014-11-22 03:33:33 +00:00
parent 147f7b70f5
commit ef14972df0
4 changed files with 10 additions and 13 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1177
Version: 1178
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1177
#define LIBYUV_VERSION 1178
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -2446,8 +2446,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
}
#endif // HAS_SPLITUVROW_SSE2
// TODO(fbarchard): Consider vpunpcklbw, vpunpckhbw, store-low1, store-low2,
// extract-high1, extract-high2.
#ifdef HAS_MERGEUVROW_AVX2
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
@ -2458,13 +2456,12 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1
"lea " MEMLEA(0x20,0) ",%0 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
"vperm2i128 $0x20,%%ymm0,%%ymm2,%%ymm1 \n"
"vperm2i128 $0x31,%%ymm0,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm1," MEMACCESS(2) " \n"
"vmovdqu %%ymm2," MEMACCESS2(0x20,2) " \n"
"vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
"vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
"vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
"vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
"lea " MEMLEA(0x40,2) ",%2 \n"
"sub $0x20,%3 \n"
"jg 1b \n"

View File

@ -2686,10 +2686,10 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
lea eax, [eax + 32]
vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
vmovdqu [edi], ymm1
vmovdqu [edi + 32], ymm2
vextractf128 [edi], ymm2, 0 // bytes 0..15
vextractf128 [edi + 16], ymm0, 0 // bytes 16..31
vextractf128 [edi + 32], ymm2, 1 // bytes 32..47
vextractf128 [edi + 48], ymm0, 1 // bytes 48..63
lea edi, [edi + 64]
sub ecx, 32
jg convertloop