mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Port HalfFloatRow_SSE2 to AVX2 but not using F16C.
R=wangcheng@google.com, hubbe@chromium.org BUG=libyuv:560 Review URL: https://codereview.chromium.org/2421993002 .
This commit is contained in:
parent
fdcf524aac
commit
2d80fc3133
@ -201,6 +201,7 @@ extern "C" {
|
||||
#define HAS_COPYROW_AVX
|
||||
#define HAS_H422TOARGBROW_AVX2
|
||||
#define HAS_HALFFLOATROW_AVX2
|
||||
// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
|
||||
#define HAS_I400TOARGBROW_AVX2
|
||||
#define HAS_I422TOARGB1555ROW_AVX2
|
||||
#define HAS_I422TOARGB4444ROW_AVX2
|
||||
@ -1931,11 +1932,14 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
|
||||
// Scale and convert to half float.
|
||||
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
|
||||
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
|
||||
@ -2570,12 +2570,20 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
HalfFloatRow = HalfFloatRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFFLOATROW_F16C)
|
||||
if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_F16C;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
HalfFloatRow = HalfFloatRow_F16C;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
for (y = 0; y < height; ++y) {
|
||||
HalfFloatRow(src_y, dst_y, scale, width);
|
||||
|
||||
@ -576,9 +576,11 @@ ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
|
||||
#endif
|
||||
#undef ANY11P16
|
||||
|
||||
|
||||
// Any 1 to 1 with yuvconstants
|
||||
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
|
||||
|
||||
@ -5341,7 +5341,43 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
#endif // HAS_HALFFLOATROW_SSE2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
// TODO(fbarchard): consider vadddw instead of vmulps
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
"vbroadcastss %3, %%ymm4 \n"
|
||||
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
|
||||
// 16 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 8 shorts
|
||||
"lea " MEMLEA(0x20,0) ",%0 \n"
|
||||
"vpunpckhwd %%ymm2,%%ymm5,%%ymm3 \n"
|
||||
"vpunpcklwd %%ymm2,%%ymm5,%%ymm2 \n"
|
||||
"vcvtdq2ps %%ymm3,%%ymm3 \n"
|
||||
"vcvtdq2ps %%ymm2,%%ymm2 \n"
|
||||
"vmulps %%ymm3,%%ymm4,%%ymm3 \n"
|
||||
"vmulps %%ymm2,%%ymm4,%%ymm2 \n"
|
||||
"vpsrld $0xd,%%ymm3,%%ymm3 \n"
|
||||
"vpsrld $0xd,%%ymm2,%%ymm2 \n"
|
||||
"vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
|
||||
"vmovdqu %%ymm2," MEMACCESS(1) " \n"
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "x"(scale * kScaleBias) // %3
|
||||
: "memory", "cc",
|
||||
"xmm2", "xmm3", "xmm4", "xmm5"
|
||||
);
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
|
||||
asm volatile (
|
||||
"vbroadcastss %3, %%ymm4 \n"
|
||||
|
||||
@ -5362,6 +5398,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
"lea " MEMLEA(0x20,1) ",%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
@ -5371,7 +5408,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
"xmm2", "xmm3", "xmm4"
|
||||
);
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
#endif // HAS_HALFFLOATROW_F16C
|
||||
|
||||
#ifdef HAS_ARGBCOLORTABLEROW_X86
|
||||
// Transform ARGB pixels with color table.
|
||||
|
||||
@ -6056,13 +6056,49 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
// Visual C 32-bit version: scale 16-bit samples and convert them to half
// floats with AVX2 integer/float instructions only (no F16C).
// Naked function: arguments are read straight off the stack.
//   [esp + 4] src, [esp + 8] dst, [esp + 12] scale, [esp + 16] width.
// Each pass of the loop converts 16 samples (32 bytes in, 32 bytes out); the
// loop tests the count after the first pass, so it runs at least once.
__declspec(naked)
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  __asm {
    mov        eax, [esp + 4]               // eax = src
    mov        edx, [esp + 8]               // edx = dst
    mov        ecx, [esp + 16]              // ecx = width (samples remaining)
    movd       xmm4, dword ptr [esp + 12]   // xmm4 = scale

    vmulss     xmm4, xmm4, kExpBias         // fold the exponent bias into scale
    vbroadcastss ymm4, xmm4                 // ymm4 = 8 copies of biased scale
    vpxor      ymm5, ymm5, ymm5             // ymm5 = 0, high words for unpack

    // Main loop: 16 samples per iteration.
  halffloat16:
    vmovdqu    ymm2, [eax]                  // load 16 shorts
    vpunpckhwd ymm3, ymm2, ymm5             // zero extend upper shorts of each lane
    vpunpcklwd ymm2, ymm2, ymm5             // zero extend lower shorts of each lane
    lea        eax, [eax + 32]              // src += 16 samples
    vcvtdq2ps  ymm2, ymm2                   // ints to floats
    vcvtdq2ps  ymm3, ymm3
    vmulps     ymm2, ymm2, ymm4             // scale so the exponent suits half float
    vmulps     ymm3, ymm3, ymm4
    vpsrld     ymm2, ymm2, 13               // shift out 13 low bits (truncate)
    vpsrld     ymm3, ymm3, 13
    vpackssdw  ymm2, ymm2, ymm3             // 16 ints back to 16 shorts
    vmovdqu    [edx], ymm2                  // store 16 half floats
    lea        edx, [edx + 32]              // dst += 16 samples
    sub        ecx, 16
    jg         halffloat16

    vzeroupper
    ret
  }
}
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
__declspec(naked)
|
||||
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src */
|
||||
mov edx, [esp + 8] /* dst */
|
||||
vbroadcastss ymm4, [esp + 12] /* scale */
|
||||
mov ecx, [esp + 16] /* width */
|
||||
|
||||
// 8 pixel loop.
|
||||
// 16 pixel loop.
|
||||
convertloop:
|
||||
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
|
||||
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
|
||||
@ -6082,7 +6118,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
#endif // HAS_HALFFLOATROW_F16C
|
||||
|
||||
#ifdef HAS_ARGBCOLORTABLEROW_X86
|
||||
// Transform ARGB pixels with color table.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user