mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
HalfFloat_SSE2 port from C algorithm to SSE2
Low level support for 12 bit 420, 422 and 444 YUV video frame conversion. BUG=libyuv:560, chromium:445071 TEST=untested R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2381493006 .
This commit is contained in:
parent
7fc932ddd3
commit
4a14cb2e81
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1621
|
||||
Version: 1622
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -262,6 +262,13 @@ extern "C" {
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available on gcc x86 platforms:
|
||||
// TODO(fbarchard): Port to Visual C.
|
||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||
#define HAS_HALFFLOATROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
|
||||
@ -1938,6 +1945,9 @@ void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
const uint8* luma, uint32 lumacoeff);
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1621
|
||||
#define LIBYUV_VERSION 1622
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2470,6 +2470,14 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_y = 0;
|
||||
}
|
||||
#if defined(HAS_HALFFLOATROW_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
HalfFloatRow = HalfFloatRow_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_AVX2;
|
||||
@ -2478,6 +2486,14 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_AVX)
|
||||
if (TestCpuFlag(kCpuHasAVX)) {
|
||||
// HalfFloatRow = HalfFloatRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
HalfFloatRow = HalfFloatRow_AVX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
HalfFloatRow(src_y, dst_y, scale, width);
|
||||
|
||||
@ -562,6 +562,9 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
|
||||
memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
|
||||
#endif
|
||||
|
||||
@ -5366,6 +5366,43 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
// SSE2 row function: reads uint16 samples from src, multiplies each by a
// scaled constant as a float, and stores the float bit pattern shifted right
// by 13 bits as a uint16 in dst.
//   src   - input row of 16-bit unsigned samples (read with unaligned loads).
//   dst   - output row of 16-bit values (written with unaligned stores).
//   scale - caller-supplied multiplier, folded into the constant below.
//   width - number of samples. Each iteration handles 8 samples and the loop
//           body runs before width is tested, so the count is effectively
//           rounded up to a multiple of 8 with a minimum of 8.
// NOTE(review): 1.9259299444e-34f is approximately 2^-112; presumably this
// rebiases the float exponent (bias 127) to the half-float exponent (bias 15)
// so that the 13-bit shift leaves a half-float bit pattern - confirm against
// HalfFloatRow_C.
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
float mult = 1.9259299444e-34f * scale;
|
||||
asm volatile (
|
||||
"movd %3,%%xmm4 \n" // mult into the low lane of xmm4
|
||||
"pshufd $0x0,%%xmm4,%%xmm4 \n" // broadcast mult to all 4 lanes
|
||||
"pxor %%xmm5,%%xmm5 \n" // xmm5 = 0, used to zero-extend 16 -> 32 bit
|
||||
|
||||
// 8 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu " MEMACCESS(0) ",%%xmm0 \n" // 8 shorts
|
||||
"lea " MEMLEA(0x10,0) ",%0 \n" // src += 16 bytes
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklwd %%xmm5,%%xmm0 \n" // 8 ints in xmm0/1
|
||||
"cvtdq2ps %%xmm0,%%xmm0 \n" // 8 floats
|
||||
"punpckhwd %%xmm5,%%xmm1 \n"
|
||||
"cvtdq2ps %%xmm1,%%xmm1 \n"
|
||||
"mulps %%xmm4,%%xmm0 \n" // multiply by mult
|
||||
"mulps %%xmm4,%%xmm1 \n"
|
||||
"psrld $0xd,%%xmm0 \n" // shift float bits right by 13
|
||||
"psrld $0xd,%%xmm1 \n"
|
||||
"packssdw %%xmm1,%%xmm0 \n" // pack 8 dwords to 8 words, signed saturation
|
||||
"movdqu %%xmm0," MEMACCESS(1) " \n" // store 8 results
|
||||
"lea " MEMLEA(0x10,1) ",%1 \n" // dst += 16 bytes
|
||||
"sub $0x8,%2 \n" // 8 samples consumed
|
||||
"jg 1b \n" // loop while width > 0
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "rm"(mult) // %3
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm1", "xmm4", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_SSE2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
@ -5394,7 +5431,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
"+r"(width) // %2
|
||||
: "x"(scale) // %3
|
||||
: "memory", "cc",
|
||||
"xmm0", "xmm4"
|
||||
"xmm0", "xmm1", "xmm4"
|
||||
);
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user