From a5e93766a20ac5fff9e0ef2f5bc7c4bb1a0fdb8e Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Thu, 13 Oct 2016 16:03:43 -0700
Subject: [PATCH] Add ARGBExtractAlpha_AVX2 function

Port SSE2 version to AVX2.

BUG=libyuv:572
TEST=/usr/local/google/home/fbarchard/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=*Extract*
R=wangcheng@google.com, magjed@chromium.org

Review URL: https://codereview.chromium.org/2420553002 .
---
 README.chromium            |  2 +-
 include/libyuv/row.h       | 12 +++++++++++
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc |  6 ++++++
 source/row_any.cc          |  3 +++
 source/row_gcc.cc          | 41 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/README.chromium b/README.chromium
index 263b7acb0..acffcfd86 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1625
+Version: 1626
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index fc89de3d6..608664bb0 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -239,6 +239,15 @@ extern "C" {
 #define HAS_BLENDPLANEROW_AVX2
 #endif
 
+
+// The following are available clang 3.4 or gcc 4.7.
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \
+    !(defined(__clang__) && defined(_M_IX86) )
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
+#endif
+
 // The following are available for AVX2 Visual C and clangcl 32 bit:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
@@ -880,9 +889,12 @@ void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
 
 void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
 void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width);
 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
 void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
                                   int width);
+void ARGBExtractAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_a,
+                                  int width);
 void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
                                   int width);
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2abd17d39..a2f6a656a 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1625
+#define LIBYUV_VERSION 1626
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 45bae1ffd..3838759be 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2702,6 +2702,12 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
                                                : ARGBExtractAlphaRow_Any_SSE2;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+                                                : ARGBExtractAlphaRow_Any_AVX2;
+  }
+#endif
 #if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
diff --git a/source/row_any.cc b/source/row_any.cc
index a3520b4ec..f9318355e 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -474,6 +474,9 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON
 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
 #endif
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index bf9ddde42..86810514d 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2860,6 +2860,47 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {  // byte 3 of each dword -> byte 0
+  3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,  // 128 (bit 7 set) zeroes a byte
+  11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u
+};
+
+void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
+  asm volatile (
+    "vmovdqa    %3,%%ymm4                      \n"  // dword permute indices
+    "vbroadcastf128 %4,%%ymm5                  \n"  // shuffle mask, both lanes
+    LABELALIGN
+  "1:                                          \n"  // 32 pixels per iteration
+    "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
+    "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
+    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // same as vpsrld $0x18
+    "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
+    "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
+    "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
+    "lea       " MEMLEA(0x80, 0) ", %0         \n"  // src_argb += 128 bytes
+    "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // per lane: reorders dwords
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // per lane: reorders dwords
+    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // per lane: reorders dwords
+    "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // restore pixel order
+    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 alpha bytes
+    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_a += 32 bytes
+    "sub        $0x20, %2                      \n"  // width -= 32 pixels
+    "jg         1b                             \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_a),     // %1
+    "+rm"(width)     // %2
+  : "m"(kPermdARGBToY_AVX),  // %3
+    "m"(kShuffleAlphaShort_AVX2)  // %4
+  : "memory", "cc"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {