From d5ee3dc9123c9fa4e90ec6b90a5c45f8434cac3f Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 19 Feb 2013 12:26:13 +0000
Subject: [PATCH] AVX2 Attenuate

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/1101014

git-svn-id: http://libyuv.googlecode.com/svn/trunk@576 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |  2 +-
 include/libyuv/row.h       |  4 ++++
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc | 14 +++++++++++++
 source/row_win.cc          | 43 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/README.chromium b/README.chromium
index d387a73c8..89399c5e0 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 575
+Version: 576
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index bd49d9323..fd9e4c633 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -137,6 +137,9 @@ extern "C" {
 #define HAS_UYVYTOUV422ROW_AVX2
 #define HAS_UYVYTOUVROW_AVX2
 #define HAS_UYVYTOYROW_AVX2
+
+// Effects
+#define HAS_ARGBATTENUATEROW_AVX2
 #endif
 #endif
 
@@ -1308,6 +1311,7 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
                                int width);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index efaa94ca9..6bd523f68 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 575
+#define LIBYUV_VERSION 576
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 22407f5aa..5df56fd82 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1032,6 +1032,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+    clear = true;
+    ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+  }
+#endif
 #if defined(HAS_ARGBATTENUATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
@@ -1046,6 +1053,13 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
+
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
+
   return 0;
 }
 
diff --git a/source/row_win.cc b/source/row_win.cc
index b39f227b0..d4a81a4a9 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4364,6 +4364,49 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
 
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const ulvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vmovdqa    ymm4, kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
+    vpslld     ymm5, ymm5, 24
+
+    align      16
+  convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
+    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
+    vpand      ymm6, ymm6, ymm5  // isolate alpha
+    vpsrlw     ymm0, ymm0, 8
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpor       ymm0, ymm0, ymm6  // copy original alpha
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
 // Aligned to 16 bytes.
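
For reference, ARGB attenuation premultiplies each color channel by the pixel's alpha while leaving alpha itself unchanged. The sketch below is a minimal scalar version of that per-pixel math, written only to illustrate this note; it is not libyuv's ARGBAttenuateRow_C, and the function name is hypothetical. The AVX2 row above approximates the same result eight pixels per iteration using 16-bit multiplies on byte-duplicated values (vpunpck*/vpshufb/vpmulhuw/vpsrlw), so rounding may differ slightly from the exact division shown here.

#include <stdint.h>

// Illustrative scalar sketch only; not part of libyuv.
// B, G, R are scaled by alpha (roughly value * alpha / 255);
// the alpha byte is copied through unchanged.
static void AttenuateRowSketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t b = src_argb[0];
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)(b * a / 255);  // premultiplied blue
    dst_argb[1] = (uint8_t)(g * a / 255);  // premultiplied green
    dst_argb[2] = (uint8_t)(r * a / 255);  // premultiplied red
    dst_argb[3] = (uint8_t)a;              // alpha unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}

The clear/vzeroupper bookkeeping added to ARGBAttenuate in planar_functions.cc exists because the new row function leaves data in the upper halves of the YMM registers; issuing vzeroupper once after the row loop avoids the AVX-to-SSE transition penalty when later SSE2/SSSE3 code runs.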