mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
I422ToYUY2Row_AVX2 use vpmovzxbw instead of vpermq
I422ToYUY2Row_AVX2 optimized from 7 cycles per 32 pixels to 6 cycles. Instead of 2 vpermq and vpunpcklbw: vmovdqu (%1),%%xmm2; vmovdqu 0x00(%1,%2,1),%%xmm3; lea 0x10(%1),%1; vpermq $0xd8,%%ymm2,%%ymm2; vpermq $0xd8,%%ymm3,%%ymm3; vpunpcklbw %%ymm3,%%ymm2,%%ymm2 ... use vpmovzxbw to expand the bytes to shorts, then vpsllw and vpor: vpmovzxbw (%1),%%ymm2; vpmovzxbw 0x00(%1,%2,1),%%ymm3; vpsllw $0x8,%%ymm3,%%ymm3; vpor %%ymm3,%%ymm2,%%ymm2, which reduces the port 5 bottleneck by 1 cycle. Bug: libyuv:556 Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt I422ToYUY2Row_AVX2 optimization Improve performance of AVX2 code by avoiding vpermq Bug: libyuv:556 Test: /usr/local/google/home/fbarchard/iaca-lin64/bin/iaca.sh -reduceout -arch BDW out/Release/obj/libyuv_internal/row_gcc.o Change-Id: Ie36732da23ecea1ffcc6b297bacc962780b59ef1 Reviewed-on: https://chromium-review.googlesource.com/898067 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
664c735677
commit
7ff53f324c
@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
|
||||
asm volatile(
|
||||
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%1),%%xmm2 \n"
|
||||
@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%1),%%xmm2 \n"
|
||||
"vmovdqu 0x00(%1,%2,1),%%xmm3 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
|
||||
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
|
||||
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
|
||||
"vpmovzxbw (%1),%%ymm2 \n"
|
||||
"vpmovzxbw 0x00(%1,%2,1),%%ymm3 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"vpsllw $0x8,%%ymm3,%%ymm3 \n"
|
||||
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"vpermq $0xd8,%%ymm2,%%ymm2 \n"
|
||||
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
|
||||
"vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||
"vmovdqu %%ymm0,(%3) \n"
|
||||
"vmovdqu %%ymm1,0x20(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm0,(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm1,0x10(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm0,0x20(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm1,0x30(%3) \n"
|
||||
"lea 0x40(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user