Use vbroadcastf128 to copy an m128 value into a ymm register, duplicating it into both the high and low 128 bits. This lets the AVX2 code share the existing 128-bit constants with the SSSE3 code instead of keeping separate 256-bit (_AVX) copies.
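For reference, a minimal stand-alone sketch of the technique (not part of this commit; the helper name BroadcastTable and the plain int8_t table are illustrative), using the AVX2 intrinsic equivalent of the one-instruction broadcast:

  #include <immintrin.h>
  #include <stdint.h>

  // 16-byte ARGB-to-Y coefficients, same values the SSSE3 path uses.
  static const int8_t kARGBToY[16] = {
    13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
  };

  // Load 128 bits and duplicate them into the low and high halves of a
  // 256-bit register (what vbroadcastf128/vbroadcasti128 does in one
  // instruction), replacing the old vmovdqu + vpermq 0x44 pair.
  static __m256i BroadcastTable(const int8_t table[16]) {
    __m128i lo = _mm_loadu_si128((const __m128i*)table);
    return _mm256_broadcastsi128_si256(lo);
  }

Because the AVX2 loads can now broadcast the existing 128-bit tables, the separate 256-bit *_AVX copies of each constant become redundant, which is why this change deletes them.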

BUG=none
TEST=avx2 unittests still pass.
R=mflodman@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/2324004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@803 16f28f9a-4ce2-e073-06de-1de4eb20be90
fbarchard@google.com 2013-09-30 06:49:10 +00:00
parent 0d19fc5ed3
commit 446f91d040
4 changed files with 22 additions and 70 deletions


@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 802
+Version: 803
 License: BSD
 License File: LICENSE


@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 802
+#define LIBYUV_VERSION 803
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT


@@ -5880,14 +5880,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 uint8* dst_argb, const float* poly,
 int width) {
 asm volatile (
-    "vmovdqu "MEMACCESS(3)",%%xmm4 \n"
-    "vmovdqu "MEMACCESS2(0x10,3)",%%xmm5 \n"
-    "vmovdqu "MEMACCESS2(0x20,3)",%%xmm6 \n"
-    "vmovdqu "MEMACCESS2(0x30,3)",%%xmm7 \n"
-    "vpermq $0x44,%%ymm4,%%ymm4 \n"
-    "vpermq $0x44,%%ymm5,%%ymm5 \n"
-    "vpermq $0x44,%%ymm6,%%ymm6 \n"
-    "vpermq $0x44,%%ymm7,%%ymm7 \n"
+    "vbroadcastf128 "MEMACCESS(3)",%%ymm4 \n"
+    "vbroadcastf128 "MEMACCESS2(0x10,3)",%%ymm5 \n"
+    "vbroadcastf128 "MEMACCESS2(0x20,3)",%%ymm6 \n"
+    "vbroadcastf128 "MEMACCESS2(0x30,3)",%%ymm7 \n"
     // 2 pixel loop.
     ".p2align 4 \n"


@@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = {
 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
 };
-static const lvec8 kARGBToY_AVX = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-static const lvec8 kARGBToYJ_AVX = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0,
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
 static const vec8 kARGBToU = {
 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
@@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = {
 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
 };
-// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
-static const lvec8 kARGBToU_AVX = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
 static const vec8 kARGBToV = {
 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
@@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = {
 -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
 };
-static const lvec8 kARGBToV_AVX = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
-};
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kShufARGBToY_AVX = {
+static const lvec32 kPermdARGBToY_AVX = {
 0, 4, 1, 5, 2, 6, 3, 7
 };
@@ -124,16 +103,6 @@ static const uvec8 kAddY16 = {
 static const vec16 kAddYJ64 = {
 64, 64, 64, 64, 64, 64, 64, 64
 };
-static const lvec16 kAddYJ64_AVX = {
-  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-};
-static const ulvec8 kAddY16_AVX = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
 static const uvec8 kAddUV128 = {
 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
@@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = {
 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
 };
-static const ulvec8 kAddUV128_AVX = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -737,9 +699,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 mov eax, [esp + 4] /* src_argb */
 mov edx, [esp + 8] /* dst_y */
 mov ecx, [esp + 12] /* pix */
-vmovdqa ymm6, kShufARGBToY_AVX
-vmovdqa ymm5, kAddY16_AVX
-vmovdqa ymm4, kARGBToY_AVX
+vbroadcastf128 ymm4, kARGBToY
+vbroadcastf128 ymm5, kAddY16
+vmovdqa ymm6, kPermdARGBToY_AVX
 align 16
 convertloop:
@@ -777,9 +739,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
 mov eax, [esp + 4] /* src_argb */
 mov edx, [esp + 8] /* dst_y */
 mov ecx, [esp + 12] /* pix */
-vmovdqa ymm4, kARGBToYJ_AVX
-vmovdqa ymm5, kAddYJ64_AVX
-vmovdqa ymm6, kShufARGBToY_AVX
+vbroadcastf128 ymm4, kARGBToYJ
+vbroadcastf128 ymm5, kAddYJ64
+vmovdqa ymm6, kPermdARGBToY_AVX
 align 16
 convertloop:
@@ -1229,9 +1191,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
 mov edx, [esp + 8 + 12] // dst_u
 mov edi, [esp + 8 + 16] // dst_v
 mov ecx, [esp + 8 + 20] // pix
-vmovdqa ymm7, kARGBToU_AVX
-vmovdqa ymm6, kARGBToV_AVX
-vmovdqa ymm5, kAddUV128_AVX
+vbroadcastf128 ymm5, kAddUV128
+vbroadcastf128 ymm6, kARGBToV
+vbroadcastf128 ymm7, kARGBToU
 sub edi, edx // stride from u to v
 align 16
@@ -6640,8 +6602,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
 mov eax, [esp + 4] // src_argb
 mov edx, [esp + 8] // dst_bayer
 mov ecx, [esp + 12] // shuffler
-vmovdqa xmm5, [ecx]
-vpermq ymm5, ymm5, 0x44 // same shuffle in high as low.
+vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
 mov ecx, [esp + 16] // pix
 align 16
@@ -6825,18 +6786,13 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 uint8* dst_argb, const float* poly,
 int width) {
 __asm {
-mov eax, [esp + 12] /* poly */
-vmovdqu xmm4, [eax] // C0
-vmovdqu xmm5, [eax + 16] // C1
-vmovdqu xmm6, [eax + 32] // C2
-vmovdqu xmm7, [eax + 48] // C3
-vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
-vpermq ymm5, ymm5, 0x44
-vpermq ymm6, ymm6, 0x44
-vpermq ymm7, ymm7, 0x44
 mov eax, [esp + 4] /* src_argb */
 mov edx, [esp + 8] /* dst_argb */
+mov ecx, [esp + 12] /* poly */
+vbroadcastf128 ymm4, [ecx] // C0
+vbroadcastf128 ymm5, [ecx + 16] // C1
+vbroadcastf128 ymm6, [ecx + 32] // C2
+vbroadcastf128 ymm7, [ecx + 48] // C3
 mov ecx, [esp + 16] /* width */
 // 2 pixel loop.