From 7ff53f324c6084b7add14f58fe9cc68f05f3b365 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 1 Feb 2018 18:27:31 -0800
Subject: [PATCH] I422ToYUY2Row_AVX2 use vpmovzxbw instead of vpermq

I422ToYUY2Row_AVX2 optimized from 7 cycles per 32 pixels to 6 cycles.

Instead of 2 vpermq and a vpunpcklbw:

    vmovdqu     (%1),%%xmm2
    vmovdqu     0x00(%1,%2,1),%%xmm3
    lea         0x10(%1),%1
    vpermq      $0xd8,%%ymm2,%%ymm2
    vpermq      $0xd8,%%ymm3,%%ymm3
    vpunpcklbw  %%ymm3,%%ymm2,%%ymm2

use vpmovzxbw to expand the bytes to shorts, then vpsllw and vpor:

    vpmovzxbw   (%1),%%ymm2
    vpmovzxbw   0x00(%1,%2,1),%%ymm3
    vpsllw      $0x8,%%ymm3,%%ymm3
    vpor        %%ymm3,%%ymm2,%%ymm2

which reduces the port 5 bottleneck by 1 cycle.

Bug: libyuv:556
Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt

I422ToYUY2Row_AVX2 optimization

Improve performance of AVX2 code by avoiding vpermq.

Bug: libyuv:556
Test: /usr/local/google/home/fbarchard/iaca-lin64/bin/iaca.sh -reduceout -arch BDW out/Release/obj/libyuv_internal/row_gcc.o

Change-Id: Ie36732da23ecea1ffcc6b297bacc962780b59ef1
Reviewed-on: https://chromium-review.googlesource.com/898067
Commit-Queue: Frank Barchard
Reviewed-by: richard winterton
---
 source/row_gcc.cc | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 85ef1319c..8cb62196b 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
   asm volatile(
 
       "sub         %1,%2                         \n"
 
-      LABELALIGN
       "1:                                        \n"
       "movq        (%1),%%xmm2                   \n"
@@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
 
       LABELALIGN
       "1:                                        \n"
-      "vmovdqu     (%1),%%xmm2                   \n"
-      "vmovdqu     0x00(%1,%2,1),%%xmm3          \n"
-      "lea         0x10(%1),%1                   \n"
-      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
-      "vpermq      $0xd8,%%ymm3,%%ymm3           \n"
-      "vpunpcklbw  %%ymm3,%%ymm2,%%ymm2          \n"
+      "vpmovzxbw   (%1),%%ymm2                   \n"
+      "vpmovzxbw   0x00(%1,%2,1),%%ymm3          \n"
+      "add         $0x10,%1                      \n"
+      "vpsllw      $0x8,%%ymm3,%%ymm3            \n"
+      "vpor        %%ymm3,%%ymm2,%%ymm2          \n"
       "vmovdqu     (%0),%%ymm0                   \n"
-      "lea         0x20(%0),%0                   \n"
-      "vpermq      $0xd8,%%ymm2,%%ymm2           \n"
-      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "add         $0x20,%0                      \n"
       "vpunpckhbw  %%ymm2,%%ymm0,%%ymm1          \n"
       "vpunpcklbw  %%ymm2,%%ymm0,%%ymm0          \n"
-      "vmovdqu     %%ymm0,(%3)                   \n"
-      "vmovdqu     %%ymm1,0x20(%3)               \n"
+      "vextractf128 $0x0,%%ymm0,(%3)             \n"
+      "vextractf128 $0x0,%%ymm1,0x10(%3)         \n"
+      "vextractf128 $0x1,%%ymm0,0x20(%3)         \n"
+      "vextractf128 $0x1,%%ymm1,0x30(%3)         \n"
       "lea         0x40(%3),%3                   \n"
       "sub         $0x20,%4                      \n"
       "jg          1b                            \n"
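
To check the equivalence of the two U/V interleave sequences outside of
inline asm, here is a minimal standalone sketch in C using AVX2
intrinsics. It is illustrative only, not libyuv code: the helper names
InterleaveWithPermq and InterleaveWithZeroExtend are invented for this
note, and each intrinsic maps to the instruction named in its comment.
Both helpers should produce the same 32 interleaved u0,v0,u1,v1,...
bytes from two 16-byte rows; the second issues one fewer shuffle uop,
which is the port 5 saving the commit message describes.

/* Sketch only: invented helpers, not the libyuv API.
 * Build with: gcc -O2 -mavx2 demo.c  (filename is an example) */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Old sequence: widen two 16-byte loads to 256 bits, reorder 64-bit
 * qwords with vpermq $0xd8, then byte-interleave with vpunpcklbw,
 * i.e. three shuffle uops competing for port 5 on Broadwell. The
 * undefined upper lane left by the cast is harmless here because
 * vpunpcklbw reads only the low 8 bytes of each 128-bit lane. */
static __m256i InterleaveWithPermq(const uint8_t* u, const uint8_t* v) {
  __m256i uu = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)u));
  __m256i vv = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)v));
  uu = _mm256_permute4x64_epi64(uu, 0xd8); /* vpermq $0xd8 */
  vv = _mm256_permute4x64_epi64(vv, 0xd8); /* vpermq $0xd8 */
  return _mm256_unpacklo_epi8(uu, vv);     /* vpunpcklbw */
}

/* New sequence: zero-extend each byte to a 16-bit lane (vpmovzxbw),
 * shift V into the high byte of each lane (vpsllw $0x8), then OR the
 * two registers together (vpor). Only the two vpmovzxbw are shuffle
 * uops; vpsllw and vpor execute on other ports, cutting port 5
 * pressure by one uop per iteration. */
static __m256i InterleaveWithZeroExtend(const uint8_t* u, const uint8_t* v) {
  __m256i uw = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)u));
  __m256i vw = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)v));
  vw = _mm256_slli_epi16(vw, 8);           /* vpsllw $0x8 */
  return _mm256_or_si256(uw, vw);          /* vpor */
}

int main(void) {
  uint8_t u[16], v[16], a[32], b[32];
  for (int i = 0; i < 16; ++i) {
    u[i] = (uint8_t)i;           /* U plane test bytes: 0x00..0x0f */
    v[i] = (uint8_t)(0x80 + i);  /* V plane test bytes: 0x80..0x8f */
  }
  _mm256_storeu_si256((__m256i*)a, InterleaveWithPermq(u, v));
  _mm256_storeu_si256((__m256i*)b, InterleaveWithZeroExtend(u, v));
  /* Both paths should yield identical UVUV output: prints "match: 1". */
  printf("match: %d\n", memcmp(a, b, 32) == 0);
  return 0;
}

In the kernel itself the same interleaved result then feeds
vpunpcklbw/vpunpckhbw against the Y row to form YUY2, so correctness of
the patch hinges only on the two sequences above being equivalent.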