Split the x86 HalfFloatRow fast path in two. HalfFloatRow_AVX2 now widens the
16 bit pixels to ints, converts to float, multiplies by a biased scale and
shifts the float bits right by 13. The previous function body is kept as
HalfFloatRow_F16C behind HAS_HALFFLOATROW_F16C, which is left commented out.

diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 09b24f91e..df11bca16 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -201,6 +201,7 @@ extern "C" {
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
 #define HAS_HALFFLOATROW_AVX2
+// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast
 #define HAS_I400TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
@@ -1931,11 +1932,14 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,

 // Scale and convert to half float.
 void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
+                           int width);
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
 void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
                            int width);
-void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width);
-void HalfFloatRow_Any_SSE2(const uint16* src, uint16* dst, float scale,
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width);
+void HalfFloatRow_Any_F16C(const uint16* src, uint16* dst, float scale,
                            int width);

 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 0bd76bc7c..da4c47b83 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2570,12 +2570,20 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
   }
 #endif
 #if defined(HAS_HALFFLOATROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+  if (TestCpuFlag(kCpuHasAVX2)) {
     HalfFloatRow = HalfFloatRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
       HalfFloatRow = HalfFloatRow_AVX2;
     }
   }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    HalfFloatRow = HalfFloatRow_Any_F16C;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_F16C;
+    }
+  }
 #endif
   for (y = 0; y < height; ++y) {
     HalfFloatRow(src_y, dst_y, scale, width);
diff --git a/source/row_any.cc b/source/row_any.cc
index f9318355e..6fd1b565e 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -576,9 +576,11 @@ ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 1, 1, 15)
 #ifdef HAS_HALFFLOATROW_AVX2
 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
 #endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 1, 1, 15)
+#endif
 #undef ANY11P16

-
 // Any 1 to 1 with yuvconstants
 #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
 void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 86810514d..03f7f1bdc 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5341,7 +5341,43 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
 #endif // HAS_HALFFLOATROW_SSE2

 #ifdef HAS_HALFFLOATROW_AVX2
+// TODO(fbarchard): consider vadddw instead of vmulps
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+  asm volatile (
+    "vbroadcastss %3, %%ymm4 \n"  // biased scale in all 8 float lanes
+    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = 0, high word when widening
+
+    // 16 pixel loop.
+    LABELALIGN
+  "1: \n"
+    "vmovdqu " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts
+    "lea " MEMLEA(0x20,0) ",%0 \n"
+    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // AT&T order: pixel low word, zero high word
+    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"  // 16 shorts zero extended to 16 ints
+    "vcvtdq2ps %%ymm3,%%ymm3 \n"  // 16 ints to 16 floats
+    "vcvtdq2ps %%ymm2,%%ymm2 \n"
+    "vmulps %%ymm3,%%ymm4,%%ymm3 \n"  // multiply by scale * kScaleBias
+    "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
+    "vpsrld $0xd,%%ymm3,%%ymm3 \n"  // shift float bits right by 13
+    "vpsrld $0xd,%%ymm2,%%ymm2 \n"
+    "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // per lane pack, undoes the unpack order
+    "vmovdqu %%ymm2," MEMACCESS(1) " \n"  // store 16 shorts
+    "lea " MEMLEA(0x20,1) ",%1 \n"
+    "sub $0x10,%2 \n"  // 16 pixels per iteration
+    "jg 1b \n"
+    "vzeroupper \n"
+  : "+r"(src), // %0
+    "+r"(dst), // %1
+    "+r"(width) // %2
+  : "x"(scale * kScaleBias) // %3
+  : "memory", "cc",
+    "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "vbroadcastss %3, %%ymm4 \n"

@@ -5362,6 +5398,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "lea " MEMLEA(0x20,1) ",%1 \n"
     "sub $0x10,%2 \n"
     "jg 1b \n"
+    "vzeroupper \n"

   : "+r"(src), // %0
     "+r"(dst), // %1
@@ -5371,7 +5408,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "xmm2", "xmm3", "xmm4"
   );
 }
-#endif // HAS_HALFFLOATROW_AVX2
+#endif // HAS_HALFFLOATROW_F16C

 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
diff --git a/source/row_win.cc b/source/row_win.cc
index 9dc805535..ecbee3042 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6056,13 +6056,49 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
 #ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
+  __asm {
+    mov eax, [esp + 4] /* src */
+    mov edx, [esp + 8] /* dst */
+    movd xmm4, dword ptr [esp + 12] /* scale */
+    mov ecx, [esp + 16] /* width */
+
+    vmulss xmm4, xmm4, kExpBias  // scale *= kExpBias (defined outside this hunk)
+    vbroadcastss ymm4, xmm4  // biased scale in all 8 float lanes
+    vpxor ymm5, ymm5, ymm5  // ymm5 = 0, high word when widening
+
+    // 16 pixel loop.
+  convertloop:
+    vmovdqu ymm2, [eax] // 16 shorts
+    lea eax, [eax + 32]
+    vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints
+    vpunpcklwd ymm2, ymm2, ymm5
+    vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats
+    vcvtdq2ps ymm2, ymm2
+    vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
+    vmulps ymm2, ymm2, ymm4
+    vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
+    vpsrld ymm2, ymm2, 13
+    vpackssdw ymm2, ymm2, ymm3  // per lane pack, undoes the unpack order
+    vmovdqu [edx], ymm2  // store 16 shorts
+    lea edx, [edx + 32]
+    sub ecx, 16  // 16 pixels per iteration
+    jg convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked)
+void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
   __asm {
     mov eax, [esp + 4] /* src */
     mov edx, [esp + 8] /* dst */
     vbroadcastss ymm4, [esp + 12] /* scale */
     mov ecx, [esp + 16] /* width */

-    // 8 pixel loop.
+    // 16 pixel loop.
   convertloop:
     vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
     vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
@@ -6082,7 +6118,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     ret
   }
 }
-#endif // HAS_HALFFLOATROW_AVX2
+#endif // HAS_HALFFLOATROW_F16C

 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.