From 805fefb9d87bfbe00ae435d779e867c48c10d530 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 12 Mar 2013 01:59:48 +0000 Subject: [PATCH] Unattenuate with workaround for vpgatherdd on avx2 BUG=none TEST=ARGBUnattenuate* Review URL: https://webrtc-codereview.appspot.com/1183004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@599 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_posix.cc | 2 -- source/row_win.cc | 69 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 69 insertions(+), 6 deletions(-) diff --git a/README.chromium b/README.chromium index d3ed2a74b..8bbd24cac 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 597 +Version: 599 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index bbf983bd2..54cb654d6 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 597 +#define LIBYUV_VERSION 599 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 32b62b933..4e62b0cd2 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3636,8 +3636,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, uintptr_t alpha = 0; asm volatile ( "sub %0,%1 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" // 4 pixel loop. 
".p2align 4 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 3bfad4c85..c1105d2e0 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4371,8 +4371,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, mov edx, [esp + 8 + 8] // dst_argb mov ecx, [esp + 8 + 12] // width sub edx, eax - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 align 16 convertloop: @@ -4416,6 +4414,9 @@ static const ulvec8 kUnattenShuffleAlpha_AVX2 = { 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15, }; +// TODO(fbarchard): Enable USE_GATHER for future hardware if faster. +// USE_GATHER is not on by default, due to being a slow instruction. +#ifdef USE_GATHER __declspec(naked) __declspec(align(16)) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -4449,6 +4450,70 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } +#else // USE_GATHER +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 + + push esi + push edi + + align 16 + convertloop: + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 + vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + vmovd xmm0, dword ptr fixed_invtbl8[esi 
* 4] // [1,a4] + vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] + vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + // end of VPGATHER + + vmovdqu ymm6, [eax] // read 8 pixels. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a + vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a + vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpackuswb ymm0, ymm0, ymm1 // unmutated. + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // USE_GATHER #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3