From 7ff53f324c6084b7add14f58fe9cc68f05f3b365 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 1 Feb 2018 18:27:31 -0800
Subject: [PATCH] I422ToYUY2Row_AVX2 use vpmovzxbw instead of vpermq

I422ToYUY2Row_AVX2 optimized from 7 cycles per 32 pixels to 6 cycles.

Instead of 2 vpermq and a vpunpcklbw:

    vmovdqu     (%1),%%xmm2
    vmovdqu     0x00(%1,%2,1),%%xmm3
    lea         0x10(%1),%1
    vpermq      $0xd8,%%ymm2,%%ymm2
    vpermq      $0xd8,%%ymm3,%%ymm3
    vpunpcklbw  %%ymm3,%%ymm2,%%ymm2

use vpmovzxbw to expand the bytes to shorts, then vpsllw and vpor:

    vpmovzxbw   (%1),%%ymm2
    vpmovzxbw   0x00(%1,%2,1),%%ymm3
    vpsllw      $0x8,%%ymm3,%%ymm3
    vpor        %%ymm3,%%ymm2,%%ymm2

which reduces the port 5 bottleneck by 1 cycle.

Bug: libyuv:556
Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt

I422ToYUY2Row_AVX2 optimization

Improve performance of AVX2 code by avoiding vpermq.

Bug: libyuv:556
Test: /usr/local/google/home/fbarchard/iaca-lin64/bin/iaca.sh -reduceout -arch BDW out/Release/obj/libyuv_internal/row_gcc.o

Change-Id: Ie36732da23ecea1ffcc6b297bacc962780b59ef1
Reviewed-on: https://chromium-review.googlesource.com/898067
Commit-Queue: Frank Barchard
Reviewed-by: richard winterton
---
 source/row_gcc.cc | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 85ef1319c..8cb62196b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
   asm volatile(
 
       "sub         %1,%2                         \n"
 
-      LABELALIGN
       "1:                                        \n"
       "movq        (%1),%%xmm2                   \n"
@@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
 
       LABELALIGN
       "1:                                        \n"
-      "vmovdqu     (%1),%%xmm2                   \n"
-      "vmovdqu     0x00(%1,%2,1),%%xmm3          \n"
-      "lea         0x10(%1),%1                   \n"
-      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
-      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
-      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
+      "vpmovzxbw   (%1),%%ymm2                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm3          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm3,%%ymm3            \n"
+      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
       "vmovdqu     (%0),%%ymm0                   \n"
-      "lea         0x20(%0),%0                   \n"
-      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
-      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "add         $0x20,%0                      \n"
       "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
       "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
-      "vmovdqu     %%ymm0,(%3)                   \n"
-      "vmovdqu     %%ymm1,0x20(%3)               \n"
+      "vextractf128 $0x0,%%ymm0,(%3)             \n"
+      "vextractf128 $0x0,%%ymm1,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm0,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x30(%3)         \n"
       "lea         0x40(%3),%3                   \n"
       "sub         $0x20,%4                      \n"
       "jg          1b                            \n"
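
To check the equivalence of the two U/V interleave sequences outside of
inline asm, here is a minimal standalone sketch in C using AVX2
intrinsics. It is illustrative only, not libyuv code: the helper names
InterleaveWithPermq and InterleaveWithZeroExtend are invented for this
note, and each intrinsic maps to the instruction named in its comment.
Both helpers should produce the same 32 interleaved u0,v0,u1,v1,...
bytes from two 16-byte rows; the second issues one fewer shuffle uop,
which is the port 5 saving the commit message describes.

/* Sketch only: invented helpers, not the libyuv API.
 * Build with: gcc -O2 -mavx2 demo.c  (filename is an example) */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Old sequence: widen two 16-byte loads to 256 bits, reorder 64-bit
 * qwords with vpermq $0xd8, then byte-interleave with vpunpcklbw,
 * i.e. three shuffle uops competing for port 5 on Broadwell. The
 * undefined upper lane left by the cast is harmless here because
 * vpunpcklbw reads only the low 8 bytes of each 128-bit lane. */
static __m256i InterleaveWithPermq(const uint8_t* u, const uint8_t* v) {
  __m256i uu = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)u));
  __m256i vv = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)v));
  uu = _mm256_permute4x64_epi64(uu, 0xd8); /* vpermq $0xd8 */
  vv = _mm256_permute4x64_epi64(vv, 0xd8); /* vpermq $0xd8 */
  return _mm256_unpacklo_epi8(uu, vv);     /* vpunpcklbw */
}

/* New sequence: zero-extend each byte to a 16-bit lane (vpmovzxbw),
 * shift V into the high byte of each lane (vpsllw $0x8), then OR the
 * two registers together (vpor). Only the two vpmovzxbw are shuffle
 * uops; vpsllw and vpor execute on other ports, cutting port 5
 * pressure by one uop per iteration. */
static __m256i InterleaveWithZeroExtend(const uint8_t* u, const uint8_t* v) {
  __m256i uw = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)u));
  __m256i vw = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)v));
  vw = _mm256_slli_epi16(vw, 8);           /* vpsllw $0x8 */
  return _mm256_or_si256(uw, vw);          /* vpor */
}

int main(void) {
  uint8_t u[16], v[16], a[32], b[32];
  for (int i = 0; i < 16; ++i) {
    u[i] = (uint8_t)i;           /* U plane test bytes: 0x00..0x0f */
    v[i] = (uint8_t)(0x80 + i);  /* V plane test bytes: 0x80..0x8f */
  }
  _mm256_storeu_si256((__m256i*)a, InterleaveWithPermq(u, v));
  _mm256_storeu_si256((__m256i*)b, InterleaveWithZeroExtend(u, v));
  /* Both paths should yield identical UVUV output: prints "match: 1". */
  printf("match: %d\n", memcmp(a, b, 32) == 0);
  return 0;
}

In the kernel itself the same interleaved result then feeds
vpunpcklbw/vpunpckhbw against the Y row to form YUY2, so correctness of
the patch hinges only on the two sequences above being equivalent.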