I422ToYUY2Row_AVX2 use vpmovzxbw instead of vpermq

I422ToYUY2Row_AVX2 optimized from 7 cycles per 32 pixels to 6 cycles.

Instead of 2 vpermq and vpunpcklbw:
  vmovdqu (%1),%%xmm2
  vmovdqu 0x00(%1,%2,1),%%xmm3
  lea 0x10(%1),%1
  vpermq $0xd8,%%ymm2,%%ymm2
  vpermq $0xd8,%%ymm3,%%ymm3
  vpunpcklbw %%ymm3,%%ymm2,%%ymm2
use vpmovzxbw to expand the bytes to shorts, then vpsllw and vpor:
  vpmovzxbw (%1),%%ymm2
  vpmovzxbw 0x00(%1,%2,1),%%ymm3
  vpsllw $0x8,%%ymm3,%%ymm3
  vpor %%ymm3,%%ymm2,%%ymm2
which reduces the port 5 bottleneck by 1 cycle.

Bug: libyuv:556
Test: out/Release/libyuv_unittest --gtest_filter=*I42?To*UY*Opt

I422ToYUY2Row_AVX2 optimization

Improve performance of AVX2 code by avoiding vpermq

Bug: libyuv:556
Test: /usr/local/google/home/fbarchard/iaca-lin64/bin/iaca.sh -reduceout -arch BDW out/Release/obj/libyuv_internal/row_gcc.o
Change-Id: Ie36732da23ecea1ffcc6b297bacc962780b59ef1
Reviewed-on: https://chromium-review.googlesource.com/898067
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
2025-12-07 17:26:49 +08:00 · 2018-02-01 18:27:31 -08:00 · 2018-02-01 18:27:31 -08:00 · 7ff53f324c
commit 7ff53f324c
parent 664c735677
1 changed file with 10 additions and 12 deletions
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5978,7 +5978,6 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
  asm volatile(

      "sub       %1,%2                             \n"
-
      LABELALIGN
      "1:                                          \n"
      "movq      (%1),%%xmm2                       \n"
@@ -6055,20 +6054,19 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,

      LABELALIGN
      "1:                                          \n"
-      "vmovdqu    (%1),%%xmm2                      \n"
-      "vmovdqu    0x00(%1,%2,1),%%xmm3             \n"
-      "lea        0x10(%1),%1                      \n"
-      "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-      "vpermq     $0xd8,%%ymm3,%%ymm3              \n"
-      "vpunpcklbw %%ymm3,%%ymm2,%%ymm2             \n"
+      "vpmovzxbw  (%1),%%ymm2                      \n"
+      "vpmovzxbw  0x00(%1,%2,1),%%ymm3             \n"
+      "add        $0x10,%1                         \n"
+      "vpsllw     $0x8,%%ymm3,%%ymm3               \n"
+      "vpor       %%ymm3,%%ymm2,%%ymm2             \n"
      "vmovdqu    (%0),%%ymm0                      \n"
-      "lea        0x20(%0),%0                      \n"
-      "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-      "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
+      "add        $0x20,%0                         \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1             \n"
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0             \n"
-      "vmovdqu    %%ymm0,(%3)                      \n"
-      "vmovdqu    %%ymm1,0x20(%3)                  \n"
+      "vextractf128 $0x0,%%ymm0,(%3)               \n"
+      "vextractf128 $0x0,%%ymm1,0x10(%3)           \n"
+      "vextractf128 $0x1,%%ymm0,0x20(%3)           \n"
+      "vextractf128 $0x1,%%ymm1,0x30(%3)           \n"
      "lea        0x40(%3),%3                      \n"
      "sub        $0x20,%4                         \n"
      "jg         1b                               \n"