mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-15 22:59:53 +08:00
Unattenuate with workaround for vpgatherdd on AVX2
BUG=none TEST=ARGBUnattenuate* Review URL: https://webrtc-codereview.appspot.com/1183004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@599 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
11a524362d
commit
805fefb9d8
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 597
|
Version: 599
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 597
|
#define LIBYUV_VERSION 599
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||||
|
|||||||
@ -3636,8 +3636,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
|||||||
uintptr_t alpha = 0;
|
uintptr_t alpha = 0;
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"sub %0,%1 \n"
|
"sub %0,%1 \n"
|
||||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
||||||
"pslld $0x18,%%xmm4 \n"
|
|
||||||
|
|
||||||
// 4 pixel loop.
|
// 4 pixel loop.
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
|
|||||||
@ -4371,8 +4371,6 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
|||||||
mov edx, [esp + 8 + 8] // dst_argb
|
mov edx, [esp + 8 + 8] // dst_argb
|
||||||
mov ecx, [esp + 8 + 12] // width
|
mov ecx, [esp + 8 + 12] // width
|
||||||
sub edx, eax
|
sub edx, eax
|
||||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
|
||||||
pslld xmm4, 24
|
|
||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
@ -4416,6 +4414,9 @@ static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
|
|||||||
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
||||||
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
|
||||||
};
|
};
|
||||||
|
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
|
||||||
|
// USE_GATHER is not on by default, due to being a slow instruction.
|
||||||
|
#ifdef USE_GATHER
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
@ -4449,6 +4450,70 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else // USE_GATHER
|
||||||
|
// Unattenuate 8 ARGB pixels (32 bytes) per loop iteration using AVX2, with the
// per-pixel table lookups done by scalar loads instead of VPGATHERDD.
//
// Parameters (read from the stack of this naked function):
//   src_argb  [esp + 4]   source pixels; the alpha byte is at offset 3 of
//                         each 4-byte pixel.
//   dst_argb  [esp + 8]   destination pixels.
//   width     [esp + 12]  pixel count. The loop steps by 8 and runs while the
//                         remaining count is > 0, so it always processes whole
//                         groups of 8 pixels and executes at least once.
//                         NOTE(review): callers presumably pass a positive
//                         multiple of 8 - confirm.
//
// Register roles:
//   eax       current source pointer (advanced by 32 per iteration).
//   edx       dst_argb - src_argb, so [eax + edx] addresses the destination.
//   ecx       remaining pixel count.
//   esi, edi  scratch for alpha bytes used as fixed_invtbl8 indices; saved
//             and restored around the loop.
//   ymm5      shuffle control loaded from kUnattenShuffleAlpha_AVX2.
//   ymm0-ymm3, ymm6, ymm7  temporaries.
//
// fixed_invtbl8 is indexed by the alpha byte and yields one dword per alpha;
// the "[1,a]" comments below describe it as a pair of 16-bit values.
// NOTE(review): presumably a fixed-point reciprocal table - verify against its
// definition.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    // Arguments are fetched before the pushes below, so the offsets are
    // relative to the entry value of esp.
    mov        eax, [esp + 4]   // src_argb0
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src
    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2

    push       esi
    push       edi

    align      16
 convertloop:
    // replace VPGATHER
    movzx      esi, byte ptr [eax + 3]                 // alpha0
    movzx      edi, byte ptr [eax + 7]                 // alpha1
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
    movzx      esi, byte ptr [eax + 11]                // alpha2
    movzx      edi, byte ptr [eax + 15]                // alpha3
    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
    movzx      esi, byte ptr [eax + 19]                // alpha4
    movzx      edi, byte ptr [eax + 23]                // alpha5
    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
    movzx      esi, byte ptr [eax + 27]                // alpha6
    movzx      edi, byte ptr [eax + 31]                // alpha7
    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    sub        ecx, 8            // sets the flags consumed by jg below;
                                 // vmovdqu and lea leave them untouched.
    vmovdqu    [eax + edx], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    pop        edi
    pop        esi
    // Clear the upper halves of the ymm registers so SSE code in the caller
    // does not pay the AVX-to-SSE transition penalty.
    vzeroupper
    ret
  }
}
|
||||||
|
#endif // USE_GATHER
|
||||||
#endif // HAS_ARGBATTENUATEROW_AVX2
|
#endif // HAS_ARGBATTENUATEROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_ARGBGRAYROW_SSSE3
|
#ifdef HAS_ARGBGRAYROW_SSSE3
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user