HalfFloat_SSE2 for Visual C

Low-level support for 12 bit 420, 422 and 444 YUV video frame conversion.

BUG=libyuv:560, chromium:445071
TEST=LibYUVPlanarTest.TestHalfFloatPlane on Windows
R=hubbe@chromium.org, wangcheng@google.com

Review URL: https://codereview.chromium.org/2387713002 .
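A note on the trick all the new rows rely on: 1.9259299444e-34f is 2^-112, the factor that rebases single precision's exponent bias (127) to half precision's bias (15). Multiplying a pixel value by scale * 2^-112 and then shifting the float's bit pattern right by 13 drops the extra mantissa bits and leaves a truncated IEEE half float. A minimal scalar sketch of what the SIMD rows below compute (an illustration, not code from this commit; HalfFloatC is a hypothetical name):

#include <stdint.h>

/* Convert one 16-bit pixel to a truncated IEEE half float.
 * Callers pick scale so value * scale lands in half float range,
 * e.g. 1.0f / 4095.0f for 12 bit video. */
static uint16_t HalfFloatC(uint16_t value, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = (float)value * (scale * 1.9259299444e-34f);  /* * 2^-112 */
  return (uint16_t)(bits.u >> 13);  /* 1+8+23 bits -> 1+5+10 bits */
}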
parent 4a14cb2e81
commit aa197ee1a3
@@ -140,6 +140,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
+#define HAS_HALFFLOATROW_SSE2
 
 // Effects:
 #define HAS_ARGBADDROW_SSE2
@@ -262,13 +263,6 @@ extern "C" {
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are available on gcc x86 platforms:
-// TODO(fbarchard): Port to Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-#define HAS_HALFFLOATROW_SSE2
-#endif
-
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
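Taken together, the two header hunks stop gating HAS_HALFFLOATROW_SSE2 on gcc and define it in the section that also covers Visual C. A sketch of the consolidated guard, assuming libyuv's usual x86 section (the exact surrounding block is not visible in this diff):

#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_HALFFLOATROW_SSE2
#endif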
@@ -2486,15 +2486,6 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_HALFFLOATROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-    // HalfFloatRow = HalfFloatRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      HalfFloatRow = HalfFloatRow_AVX;
-    }
-  }
-#endif
-
   for (y = 0; y < height; ++y) {
     HalfFloatRow(src_y, dst_y, scale, width);
     src_y += src_stride_y;
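The dispatch that remains in HalfFloatPlane follows libyuv's usual pattern: start from the C row and upgrade to the widest SIMD row the CPU supports. A sketch reconstructed from the context lines (the HalfFloatRow_C fallback name and the exact alignment checks are assumptions; note the removed block itself still had its _Any_ wrapper commented out):

  void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
      HalfFloatRow_C;
#if defined(HAS_HALFFLOATROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) {
    HalfFloatRow = HalfFloatRow_SSE2;  /* SSE2 row does 8 pixels per iteration */
  }
#endif
#if defined(HAS_HALFFLOATROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
    HalfFloatRow = HalfFloatRow_AVX2;  /* AVX2 row does 16 pixels per iteration */
  }
#endif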
@@ -5367,38 +5367,37 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
 #ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
 void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
-  float mult = 1.9259299444e-34f * scale;
   asm volatile (
-    "movd       %3,%%xmm4                  \n"
-    "pshufd     $0x0,%%xmm4,%%xmm4         \n"
+    "pshufd     $0x0,%3,%%xmm4             \n"
     "pxor       %%xmm5,%%xmm5              \n"
 
     // 16 pixel loop.
     LABELALIGN
   "1:                                      \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0    \n"  // 8 shorts
+    "movdqu     " MEMACCESS(0) ",%%xmm2    \n"  // 8 shorts
     "lea        " MEMLEA(0x10,0) ",%0      \n"
-    "movdqa     %%xmm0,%%xmm1              \n"
+    "movdqa     %%xmm2,%%xmm3              \n"
-    "punpcklwd  %%xmm5,%%xmm0              \n"  // 8 ints in xmm0/1
+    "punpcklwd  %%xmm5,%%xmm2              \n"  // 8 ints in xmm2/3
-    "cvtdq2ps   %%xmm0,%%xmm0              \n"  // 8 floats
+    "cvtdq2ps   %%xmm2,%%xmm2              \n"  // 8 floats
-    "punpckhwd  %%xmm5,%%xmm1              \n"
+    "punpckhwd  %%xmm5,%%xmm3              \n"
-    "cvtdq2ps   %%xmm1,%%xmm1              \n"
+    "cvtdq2ps   %%xmm3,%%xmm3              \n"
-    "mulps      %%xmm4,%%xmm0              \n"
+    "mulps      %%xmm4,%%xmm2              \n"
-    "mulps      %%xmm4,%%xmm1              \n"
+    "mulps      %%xmm4,%%xmm3              \n"
-    "psrld      $0xd,%%xmm0                \n"
+    "psrld      $0xd,%%xmm2                \n"
-    "psrld      $0xd,%%xmm1                \n"
+    "psrld      $0xd,%%xmm3                \n"
-    "packssdw   %%xmm1,%%xmm0              \n"
+    "packssdw   %%xmm3,%%xmm2              \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "    \n"
+    "movdqu     %%xmm2," MEMACCESS(1) "    \n"
     "lea        " MEMLEA(0x10,1) ",%1      \n"
     "sub        $0x8,%2                    \n"
     "jg         1b                         \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
     "+r"(width)  // %2
-  : "rm"(mult)   // %3
+  : "x"(scale * kScaleBias)  // %3
   : "memory", "cc",
-    "xmm0", "xmm1", "xmm4", "xmm5"
+    "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_HALFFLOATROW_SSE2
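Two things change in the gcc body above besides the register renumbering. First, the scale * kScaleBias product moves out of the asm: the "x" constraint hands it to the asm already sitting in an SSE register, so a single pshufd broadcast replaces the old movd + pshufd pair. Second, the xmm0/xmm1 scratch registers become xmm2/xmm3, which appears to keep them consistent with the Visual C body added further down. A minimal standalone sketch of the broadcast pattern, assuming gcc/clang x86 inline asm (BroadcastScale is a hypothetical name):

static inline void BroadcastScale(float scale) {
  asm volatile(
    "pshufd $0x0,%0,%%xmm4 \n"  /* splat lane 0 of %0 into all lanes of xmm4 */
    :
    : "x"(scale * 1.9259299444e-34f)  /* "x" = any SSE register */
    : "xmm4");
}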
@@ -5411,17 +5410,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     // 16 pixel loop.
     LABELALIGN
   "1:                                      \n"
-    "vpmovzxwd  " MEMACCESS(0) ",%%ymm0    \n"  // 8 shorts -> 8 ints
+    "vpmovzxwd  " MEMACCESS(0) ",%%ymm2    \n"  // 8 shorts -> 8 ints
-    "vpmovzxwd  " MEMACCESS2(0x10,0) ",%%ymm1 \n"  // 8 more
+    "vpmovzxwd  " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // 8 more
     "lea        " MEMLEA(0x20,0) ",%0      \n"
-    "vcvtdq2ps  %%ymm0,%%ymm0              \n"
+    "vcvtdq2ps  %%ymm2,%%ymm2              \n"
-    "vcvtdq2ps  %%ymm1,%%ymm1              \n"
+    "vcvtdq2ps  %%ymm3,%%ymm3              \n"
-    "vmulps     %%ymm0,%%ymm4,%%ymm0       \n"
+    "vmulps     %%ymm2,%%ymm4,%%ymm2       \n"
-    "vmulps     %%ymm1,%%ymm4,%%ymm1       \n"
+    "vmulps     %%ymm3,%%ymm4,%%ymm3       \n"
-    "vcvtps2ph  $3, %%ymm0, %%xmm0         \n"
+    "vcvtps2ph  $3, %%ymm2, %%xmm2         \n"
-    "vcvtps2ph  $3, %%ymm1, %%xmm1         \n"
+    "vcvtps2ph  $3, %%ymm3, %%xmm3         \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "    \n"
+    "vmovdqu    %%xmm2," MEMACCESS(1) "    \n"
-    "vmovdqu    %%xmm1," MEMACCESS2(0x10,1) " \n"
+    "vmovdqu    %%xmm3," MEMACCESS2(0x10,1) " \n"
     "lea        " MEMLEA(0x20,1) ",%1      \n"
     "sub        $0x10,%2                   \n"
     "jg         1b                         \n"
@@ -5431,7 +5430,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "+r"(width)  // %2
   : "x"(scale)   // %3
   : "memory", "cc",
-    "xmm0", "xmm1", "xmm4"
+    "xmm2", "xmm3", "xmm4"
   );
 }
 #endif  // HAS_HALFFLOATROW_AVX2
@@ -6095,6 +6095,42 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked)
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src */
+    mov        edx, [esp + 8]   /* dst */
+    movd       xmm4, dword ptr [esp + 12]  /* scale */
+    mov        ecx, [esp + 16]  /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+
+    // 8 pixel loop.
+  convertloop:
+    movdqu     xmm2, xmmword ptr [eax]  // 8 shorts
+    lea        eax, [eax + 16]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm5
+    cvtdq2ps   xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd  xmm3, xmm5
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    psrld      xmm2, 13
+    psrld      xmm3, 13
+    packssdw   xmm2, xmm3
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
 #ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
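The new Visual C row uses __declspec(naked), so MSVC emits no prologue or epilogue: the body reads its cdecl arguments directly off the stack ([esp + 4] is the first argument) and must return with an explicit ret. mulss xmm4, kExpBias folds the exponent bias into scale once up front, mirroring the gcc version. A minimal sketch of the naked-function convention (AddOne is a hypothetical example):

__declspec(naked)
void AddOne(int* value) {
  __asm {
    mov eax, [esp + 4]   // first cdecl argument; no stack frame was built
    inc dword ptr [eax]
    ret                  // naked functions must return explicitly
  }
}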
@@ -6106,17 +6142,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
 
     // 8 pixel loop.
   convertloop:
-    vpmovzxwd  ymm0, xmmword ptr [eax]      // 8 shorts -> 8 ints
+    vpmovzxwd  ymm2, xmmword ptr [eax]      // 8 shorts -> 8 ints
-    vpmovzxwd  ymm1, xmmword ptr [eax + 16] // 8 more shorts
+    vpmovzxwd  ymm3, xmmword ptr [eax + 16] // 8 more shorts
     lea        eax, [eax + 32]
-    vcvtdq2ps  ymm0, ymm0                   // convert 8 ints to floats
+    vcvtdq2ps  ymm2, ymm2                   // convert 8 ints to floats
-    vcvtdq2ps  ymm1, ymm1
+    vcvtdq2ps  ymm3, ymm3
-    vmulps     ymm0, ymm0, ymm4             // scale to normalized range 0 to 1
+    vmulps     ymm2, ymm2, ymm4             // scale to normalized range 0 to 1
-    vmulps     ymm1, ymm1, ymm4
+    vmulps     ymm3, ymm3, ymm4
-    vcvtps2ph  xmm0, ymm0, 3                // float convert to 8 half floats truncate
+    vcvtps2ph  xmm2, ymm2, 3                // float convert to 8 half floats truncate
-    vcvtps2ph  xmm1, ymm1, 3
+    vcvtps2ph  xmm3, ymm3, 3
-    vmovdqu    [edx], xmm0
+    vmovdqu    [edx], xmm2
-    vmovdqu    [edx + 16], xmm1
+    vmovdqu    [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 16
     jg         convertloop
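The AVX2 rows lean on F16C's vcvtps2ph, whose immediate 3 selects round toward zero (the "truncate" in the comments). One 8-pixel step of the 16-pixel loop can also be written with intrinsics; a sketch assuming <immintrin.h> and an AVX2 + F16C target (HalfFloat8 is a hypothetical name):

#include <immintrin.h>
#include <stdint.h>

/* 8 uint16 pixels -> 8 truncated half floats, matching one
 * vpmovzxwd / vcvtdq2ps / vmulps / vcvtps2ph sequence in the loop above. */
static inline void HalfFloat8(const uint16_t* src, uint16_t* dst, float scale) {
  __m256i i32 = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i*)src));
  __m256 f = _mm256_mul_ps(_mm256_cvtepi32_ps(i32), _mm256_set1_ps(scale));
  _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(f, 3));  /* 3 = truncate */
}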