mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
ARGBUnattenuate_AVX2 ported to GCC. Minor cleanup of constants to use broadcast to make 16 byte constant instead of 32 byte.
BUG=269 TESTED=try bots R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/30999004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1163 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
f8c334473b
commit
91000425a3
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1162
|
||||
Version: 1163
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -205,6 +205,7 @@ extern "C" {
|
||||
#define HAS_ARGBSUBTRACTROW_AVX2
|
||||
#define HAS_ARGBMULTIPLYROW_AVX2
|
||||
#define HAS_ARGBATTENUATEROW_AVX2
|
||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||
#endif
|
||||
|
||||
// The following require VS2012.
|
||||
@ -218,10 +219,7 @@ extern "C" {
|
||||
#define HAS_I422TOABGRROW_AVX2
|
||||
#define HAS_INTERPOLATEROW_AVX2
|
||||
#define HAS_MIRRORROW_AVX2
|
||||
|
||||
// Effects:
|
||||
#define HAS_ARGBMIRRORROW_AVX2
|
||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||
#endif // defined(VISUALC_HAS_AVX2)
|
||||
|
||||
// The following are Yasm x86 only:
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1162
|
||||
#define LIBYUV_VERSION 1163
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -3379,7 +3379,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
"jge 40b \n"
|
||||
"jmp 49f \n"
|
||||
|
||||
// 4 pixel unaligned loop.
|
||||
// 4 pixel loop.
|
||||
LABELALIGN
|
||||
"41: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm3 \n"
|
||||
@ -3449,7 +3449,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
|
||||
#ifdef HAS_ARGBATTENUATEROW_SSE2
|
||||
// Attenuate 4 pixels at a time.
|
||||
// aligned to 16 bytes
|
||||
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
@ -3497,14 +3496,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
#ifdef HAS_ARGBATTENUATEROW_SSSE3
|
||||
// Shuffle table duplicating alpha
|
||||
static uvec8 kShuffleAlpha0 = {
|
||||
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
|
||||
3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
|
||||
};
|
||||
static uvec8 kShuffleAlpha1 = {
|
||||
11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
|
||||
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
|
||||
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
|
||||
};
|
||||
// Attenuate 4 pixels at a time.
|
||||
// aligned to 16 bytes
|
||||
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm3,%%xmm3 \n"
|
||||
@ -3551,9 +3549,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
|
||||
#ifdef HAS_ARGBATTENUATEROW_AVX2
|
||||
// Shuffle table duplicating alpha.
|
||||
static const ulvec8 kShuffleAlpha_AVX2 = {
|
||||
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
|
||||
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
||||
static const uvec8 kShuffleAlpha_AVX2 = {
|
||||
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
||||
};
|
||||
// Attenuate 8 pixels at a time.
|
||||
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
@ -3597,7 +3594,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
|
||||
// Unattenuate 4 pixels at a time.
|
||||
// aligned to 16 bytes
|
||||
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
uintptr_t alpha = 0;
|
||||
@ -3647,6 +3643,80 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
}
|
||||
#endif // HAS_ARGBUNATTENUATEROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
|
||||
// Shuffle table duplicating alpha.
// Byte indices for vpshufb: in each 8-byte half, source bytes 0-1 are copied
// into words 0, 1 and 2, while bytes 6-7 stay in place as word 3 (and likewise
// bytes 8-9 / 14-15 for the second half).
// Only 16 bytes are stored; ARGBUnattenuateRow_AVX2 loads the table with
// vbroadcastf128 so the same pattern fills both 128-bit halves of a ymm
// register.
|
||||
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
|
||||
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
|
||||
};
|
||||
// Unattenuate 8 pixels at a time.
// Each loop iteration reads 32 bytes from src_argb, stores 32 bytes to
// dst_argb and subtracts 8 from width; the loop repeats while the remaining
// width is greater than zero.
// The byte at offset 3 of every 4-byte pixel indexes fixed_invtbl8 (4-byte
// entries); the looked-up values are multiplied (vpmulhuw) with the pixel
// bytes widened to 16 bits, and the products are packed back to bytes.
// NOTE(review): fixed_invtbl8 is defined outside this view; presumably a table
// of fixed-point reciprocals indexed by alpha - confirm.
|
||||
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
// Scratch register (%3) that receives each pixel's table index.
uintptr_t alpha = 0;
|
||||
asm volatile (
|
||||
// Turn dst_argb into an offset from src_argb so that advancing %0 alone
// moves both the load and the store address.
"sub %0,%1 \n"
|
||||
// Copy the 16-byte shuffle table into both 128-bit halves of ymm5.
"vbroadcastf128 %5,%%ymm5 \n"
|
||||
|
||||
// 8 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// replace VPGATHER: fetch 8 table entries with scalar loads, one per pixel,
// indexed by the bytes at offsets 0x03, 0x07, ... 0x1f from %0.
|
||||
"movzb " MEMACCESS2(0x03,0) ",%3 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
|
||||
"movzb " MEMACCESS2(0x07,0) ",%3 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
|
||||
"movzb " MEMACCESS2(0x0b,0) ",%3 \n"
|
||||
// xmm6 = entries for pixels 0 and 1.
"vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
|
||||
"movzb " MEMACCESS2(0x0f,0) ",%3 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
|
||||
"movzb " MEMACCESS2(0x13,0) ",%3 \n"
|
||||
// xmm7 = entries for pixels 2 and 3.
"vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0
|
||||
"movzb " MEMACCESS2(0x17,0) ",%3 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1
|
||||
"movzb " MEMACCESS2(0x1b,0) ",%3 \n"
|
||||
// xmm0 = entries for pixels 4 and 5.
"vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2
|
||||
"movzb " MEMACCESS2(0x1f,0) ",%3 \n"
|
||||
MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3
|
||||
// xmm2 = entries for pixels 6 and 7.
"vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n"
|
||||
// Merge the pairs: xmm3 = pixels 0-3, xmm0 = pixels 4-7, then join them
// into the low and high halves of ymm3.
"vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n"
|
||||
"vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n"
|
||||
"vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n"
|
||||
// end of VPGATHER: ymm3 holds the 8 looked-up 32-bit table entries.
|
||||
|
||||
// Load 8 source pixels and widen every byte to a 16-bit word by pairing it
// with itself.
"vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
|
||||
"vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
|
||||
"vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
|
||||
// Duplicate each 16-bit half of the table entries so every pixel gets a
// 64-bit group, then vpshufb places the entry's low word in words 0-2 and
// keeps its high word in word 3.
"vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n"
|
||||
"vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n"
|
||||
"vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
|
||||
"vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
|
||||
// Keep the high 16 bits of each unsigned 16x16 product.
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||
// Pack the words back to bytes with unsigned saturation.
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
// The flags set by this sub are the ones tested by the jg below.
"sub $0x8,%2 \n"
|
||||
MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
|
||||
// Advance the source pointer by 32 bytes (8 pixels).
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"jg 1b \n"
|
||||
// Zero the upper halves of the ymm registers before leaving the asm block.
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(width), // %2
|
||||
"+r"(alpha) // %3
|
||||
: "r"(fixed_invtbl8), // %4
|
||||
"m"(kUnattenShuffleAlpha_AVX2) // %5
|
||||
: "memory", "cc"
|
||||
#if defined(__native_client__) && defined(__x86_64__)
|
||||
, "r14"
|
||||
#endif
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif // HAS_ARGBUNATTENUATEROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBGRAYROW_SSSE3
|
||||
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
|
||||
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
@ -3841,7 +3911,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
|
||||
#ifdef HAS_ARGBQUANTIZEROW_SSE2
|
||||
// Quantize 4 ARGB pixels (16 bytes).
|
||||
// aligned to 16 bytes
|
||||
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
int interval_offset, int width) {
|
||||
asm volatile (
|
||||
@ -3894,7 +3963,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
|
||||
// Aligned to 16 bytes.
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
asm volatile (
|
||||
|
||||
@ -72,7 +72,6 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
||||
// 64 bit
|
||||
#if defined(_M_X64)
|
||||
|
||||
// Aligned destination version.
|
||||
__declspec(align(16))
|
||||
void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
@ -165,7 +164,7 @@ static const lvec32 kPermdARGBToY_AVX = {
|
||||
// vpshufb for vphaddw + vpackuswb packed to shorts.
|
||||
static const lvec8 kShufARGBToUV_AVX = {
|
||||
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
|
||||
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
|
||||
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
|
||||
};
|
||||
|
||||
// Constants for BGRA.
|
||||
@ -1845,7 +1844,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
|
||||
__asm packuswb xmm2, xmm2 /* R */ \
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
@ -1888,7 +1887,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
||||
@ -1935,7 +1934,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToRAWRow_SSSE3(const uint8* y_buf,
|
||||
@ -2055,7 +2054,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
@ -2098,7 +2097,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
// Similar to I420 but duplicate UV once more.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
@ -2144,7 +2143,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
@ -2182,7 +2181,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
// 8 pixels, dest aligned 16.
|
||||
// 8 pixels.
|
||||
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
@ -2423,8 +2422,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
|
||||
#ifdef HAS_MIRRORROW_AVX2
|
||||
// Shuffle table for reversing the bytes.
|
||||
static const ulvec8 kShuffleMirror_AVX2 = {
|
||||
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
|
||||
static const uvec8 kShuffleMirror_AVX2 = {
|
||||
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
||||
};
|
||||
|
||||
@ -2434,7 +2432,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
|
||||
mov eax, [esp + 4] // src
|
||||
mov edx, [esp + 8] // dst
|
||||
mov ecx, [esp + 12] // width
|
||||
vmovdqa ymm5, kShuffleMirror_AVX2
|
||||
vbroadcastf128 ymm5, kShuffleMirror_AVX2
|
||||
lea eax, [eax - 32]
|
||||
|
||||
align 4
|
||||
@ -3711,7 +3709,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
|
||||
|
||||
#ifdef HAS_ARGBATTENUATEROW_SSE2
|
||||
// Attenuate 4 pixels at a time.
|
||||
// Aligned to 16 bytes.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
__asm {
|
||||
@ -3805,8 +3802,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
#ifdef HAS_ARGBATTENUATEROW_AVX2
|
||||
// Shuffle table duplicating alpha.
|
||||
static const uvec8 kShuffleAlpha_AVX2 = {
|
||||
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
|
||||
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
||||
6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
|
||||
};
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
@ -3846,7 +3842,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
|
||||
// Unattenuate 4 pixels at a time.
|
||||
// Aligned to 16 bytes.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
@ -3896,7 +3891,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
|
||||
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
|
||||
// Shuffle table duplicating alpha.
|
||||
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
|
||||
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
|
||||
0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
|
||||
};
|
||||
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
|
||||
@ -4185,7 +4180,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
|
||||
|
||||
#ifdef HAS_ARGBQUANTIZEROW_SSE2
|
||||
// Quantize 4 ARGB pixels (16 bytes).
|
||||
// Aligned to 16 bytes.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
int interval_offset, int width) {
|
||||
@ -4232,7 +4226,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
|
||||
|
||||
#ifdef HAS_ARGBSHADEROW_SSE2
|
||||
// Shade 4 pixels at a time by specified value.
|
||||
// Aligned to 16 bytes.
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
uint32 value) {
|
||||
@ -4738,8 +4731,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
|
||||
// area is the number of pixels in the area being averaged.
|
||||
// dst points to pixel to store result to.
|
||||
// count is number of averaged pixels to produce.
|
||||
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
|
||||
// aligned.
|
||||
// Does 4 pixels at a time.
|
||||
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
|
||||
int width, int area, uint8* dst,
|
||||
int count) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user