mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
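Use vbroadcastf128 to load a 128-bit memory operand (m128) into a ymm register, duplicating the value into both the high and low 128 bits. This allows the AVX2 row functions to share the existing 16-byte constants (e.g. kARGBToY, kAddY16) instead of needing separate 32-byte _AVX copies.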
BUG=none
TEST=avx2 unittests still pass.
R=mflodman@webrtc.org
Review URL: https://webrtc-codereview.appspot.com/2324004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@803 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
0d19fc5ed3
commit
446f91d040
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 802
|
||||
Version: 803
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 802
|
||||
#define LIBYUV_VERSION 803
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -5880,14 +5880,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
uint8* dst_argb, const float* poly,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"vmovdqu "MEMACCESS(3)",%%xmm4 \n"
|
||||
"vmovdqu "MEMACCESS2(0x10,3)",%%xmm5 \n"
|
||||
"vmovdqu "MEMACCESS2(0x20,3)",%%xmm6 \n"
|
||||
"vmovdqu "MEMACCESS2(0x30,3)",%%xmm7 \n"
|
||||
"vpermq $0x44,%%ymm4,%%ymm4 \n"
|
||||
"vpermq $0x44,%%ymm5,%%ymm5 \n"
|
||||
"vpermq $0x44,%%ymm6,%%ymm6 \n"
|
||||
"vpermq $0x44,%%ymm7,%%ymm7 \n"
|
||||
"vbroadcastf128 "MEMACCESS(3)",%%ymm4 \n"
|
||||
"vbroadcastf128 "MEMACCESS2(0x10,3)",%%ymm5 \n"
|
||||
"vbroadcastf128 "MEMACCESS2(0x20,3)",%%ymm6 \n"
|
||||
"vbroadcastf128 "MEMACCESS2(0x30,3)",%%ymm7 \n"
|
||||
|
||||
// 2 pixel loop.
|
||||
".p2align 4 \n"
|
||||
|
||||
@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = {
|
||||
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
|
||||
};
|
||||
|
||||
static const lvec8 kARGBToY_AVX = {
|
||||
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
|
||||
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
|
||||
};
|
||||
|
||||
static const lvec8 kARGBToYJ_AVX = {
|
||||
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0,
|
||||
15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
|
||||
};
|
||||
|
||||
static const vec8 kARGBToU = {
|
||||
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
|
||||
};
|
||||
@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = {
|
||||
127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
|
||||
};
|
||||
|
||||
// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
|
||||
static const lvec8 kARGBToU_AVX = {
|
||||
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
|
||||
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
|
||||
};
|
||||
|
||||
static const vec8 kARGBToV = {
|
||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
|
||||
};
|
||||
@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = {
|
||||
-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
|
||||
};
|
||||
|
||||
static const lvec8 kARGBToV_AVX = {
|
||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
|
||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
|
||||
};
|
||||
|
||||
// vpermd for vphaddw + vpackuswb vpermd.
|
||||
static const lvec32 kShufARGBToY_AVX = {
|
||||
static const lvec32 kPermdARGBToY_AVX = {
|
||||
0, 4, 1, 5, 2, 6, 3, 7
|
||||
};
|
||||
|
||||
@ -124,16 +103,6 @@ static const uvec8 kAddY16 = {
|
||||
static const vec16 kAddYJ64 = {
|
||||
64, 64, 64, 64, 64, 64, 64, 64
|
||||
};
|
||||
static const lvec16 kAddYJ64_AVX = {
|
||||
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
|
||||
};
|
||||
|
||||
static const ulvec8 kAddY16_AVX = {
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
|
||||
};
|
||||
|
||||
static const uvec8 kAddUV128 = {
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = {
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
|
||||
};
|
||||
|
||||
static const ulvec8 kAddUV128_AVX = {
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
|
||||
};
|
||||
|
||||
// Shuffle table for converting RGB24 to ARGB.
|
||||
static const uvec8 kShuffleMaskRGB24ToARGB = {
|
||||
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
||||
@ -737,9 +699,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
vmovdqa ymm6, kShufARGBToY_AVX
|
||||
vmovdqa ymm5, kAddY16_AVX
|
||||
vmovdqa ymm4, kARGBToY_AVX
|
||||
vbroadcastf128 ymm4, kARGBToY
|
||||
vbroadcastf128 ymm5, kAddY16
|
||||
vmovdqa ymm6, kPermdARGBToY_AVX
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
@ -777,9 +739,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
vmovdqa ymm4, kARGBToYJ_AVX
|
||||
vmovdqa ymm5, kAddYJ64_AVX
|
||||
vmovdqa ymm6, kShufARGBToY_AVX
|
||||
vbroadcastf128 ymm4, kARGBToYJ
|
||||
vbroadcastf128 ymm5, kAddYJ64
|
||||
vmovdqa ymm6, kPermdARGBToY_AVX
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
@ -1229,9 +1191,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
vmovdqa ymm7, kARGBToU_AVX
|
||||
vmovdqa ymm6, kARGBToV_AVX
|
||||
vmovdqa ymm5, kAddUV128_AVX
|
||||
vbroadcastf128 ymm5, kAddUV128
|
||||
vbroadcastf128 ymm6, kARGBToV
|
||||
vbroadcastf128 ymm7, kARGBToU
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
align 16
|
||||
@ -6640,8 +6602,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // shuffler
|
||||
vmovdqa xmm5, [ecx]
|
||||
vpermq ymm5, ymm5, 0x44 // same shuffle in high as low.
|
||||
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
|
||||
mov ecx, [esp + 16] // pix
|
||||
|
||||
align 16
|
||||
@ -6825,18 +6786,13 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
uint8* dst_argb, const float* poly,
|
||||
int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 12] /* poly */
|
||||
vmovdqu xmm4, [eax] // C0
|
||||
vmovdqu xmm5, [eax + 16] // C1
|
||||
vmovdqu xmm6, [eax + 32] // C2
|
||||
vmovdqu xmm7, [eax + 48] // C3
|
||||
vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords
|
||||
vpermq ymm5, ymm5, 0x44
|
||||
vpermq ymm6, ymm6, 0x44
|
||||
vpermq ymm7, ymm7, 0x44
|
||||
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_argb */
|
||||
mov ecx, [esp + 12] /* poly */
|
||||
vbroadcastf128 ymm4, [ecx] // C0
|
||||
vbroadcastf128 ymm5, [ecx + 16] // C1
|
||||
vbroadcastf128 ymm6, [ecx + 32] // C2
|
||||
vbroadcastf128 ymm7, [ecx + 48] // C3
|
||||
mov ecx, [esp + 16] /* width */
|
||||
|
||||
// 2 pixel loop.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user