From b99bcab7f77ebc724ed451c04e72b589a3d4acbb Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 30 Sep 2013 07:47:59 +0000 Subject: [PATCH] ARGBShuffle_AVX2 for speed up end swapping for Chrome/Java. BUG=271 TESTED=ARGBShuffle unittest R=mflodman@webrtc.org Review URL: https://webrtc-codereview.appspot.com/2320005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@804 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 6 ++++-- include/libyuv/version.h | 2 +- source/planar_functions.cc | 8 ++++---- source/row_posix.cc | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 42 insertions(+), 8 deletions(-) diff --git a/README.chromium b/README.chromium index fc0a93c32..08e3028e7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 803 +Version: 804 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 870892ed5..2e78ba0b7 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -150,11 +150,14 @@ extern "C" { #define GCC_HAS_AVX2 1 #endif // GNUC >= 4.7 #endif // __GNUC__ +// TODO(fbarchard): Test with new NaCL tool chain. Change __native_client__AVX2 +// to __native_client__ to test. #if !defined(LIBYUV_DISABLE_X86) && \ ((defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700) || \ - defined(__native_client__) || defined(__clang__) || defined(GCC_HAS_AVX2)) + defined(__native_client__AVX2) || defined(__clang__) || defined(GCC_HAS_AVX2)) // Effects: #define HAS_ARGBPOLYNOMIALROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 #endif // The following are Windows only: @@ -166,7 +169,6 @@ extern "C" { // Caveat: Visual C 2012 required for AVX2. 
#if _MSC_VER >= 1700 -#define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d161e3db4..a451d05d8 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 803 +#define LIBYUV_VERSION 804 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index de7675546..594641d8b 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1592,7 +1592,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, int x; for (x = 0; x < radius + 1; ++x) { CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + boxwidth, area, &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } @@ -1600,15 +1600,15 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Middle unclipped. int n = (width - 1) - radius - x + 1; CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + boxwidth, area, &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + cumsum_bot_row + (x - radius - 1) * 4, + boxwidth, area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } diff --git a/source/row_posix.cc b/source/row_posix.cc index ff2718853..56b34ff45 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -5726,6 +5726,38 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_SSSE3 +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,  // Byte-shuffles src_argb into dst_argb.
+                         const uint8* shuffler, int pix) {  // 16 pixels (64 bytes) per pass; loop runs until pix <= 0.
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"  // 16-byte shuffler -> both lanes; no alignment required.
+
+    ".p2align  4                               \n"
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"  // Load 2 x 32 bytes of source.
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"  // Per-128-bit-lane byte shuffle.
+    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+    "sub       $0x10,%2                        \n"  // Sets flags for jg; vmovdqu/lea below leave them intact.
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"  // Clear upper YMM state to avoid AVX->SSE transition stalls.
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
 #ifdef HAS_I422TOYUY2ROW_SSE2
 void I422ToYUY2Row_SSE2(const uint8* src_y,
                         const uint8* src_u,