diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index b810221ec..7bbad513c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -140,6 +140,7 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_SSE2
 #define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
+#define HAS_HALFFLOATROW_SSE2
 
 // Effects:
 #define HAS_ARGBADDROW_SSE2
@@ -262,13 +263,6 @@ extern "C" {
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
-// The following are available on gcc x86 platforms:
-// TODO(fbarchard): Port to Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-#define HAS_HALFFLOATROW_SSE2
-#endif
-
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 67e01f45c..53248ee34 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2486,15 +2486,6 @@ int HalfFloatPlane(const uint16* src_y, int src_stride_y,
     }
   }
 #endif
-#if defined(HAS_HALFFLOATROW_AVX)
-  if (TestCpuFlag(kCpuHasAVX)) {
-//  HalfFloatRow = HalfFloatRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
-      HalfFloatRow = HalfFloatRow_AVX;
-    }
-  }
-#endif
-
   for (y = 0; y < height; ++y) {
     HalfFloatRow(src_y, dst_y, scale, width);
     src_y += src_stride_y;
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index c3bd148e1..6522ac56c 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5367,38 +5367,37 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
 #ifdef HAS_HALFFLOATROW_SSE2
+static float kScaleBias = 1.9259299444e-34f;
 void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
-  float mult = 1.9259299444e-34f * scale;
   asm volatile (
-    "movd       %3,%%xmm4                      \n"
-    "pshufd     $0x0,%%xmm4,%%xmm4             \n"
+    "pshufd     $0x0,%3,%%xmm4                 \n"
     "pxor       %%xmm5,%%xmm5                  \n"
 
     // 16 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"  // 8 shorts
+    "movdqu     " MEMACCESS(0) ",%%xmm2        \n"  // 8 shorts
     "lea        " MEMLEA(0x10,0) ",%0          \n"
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "punpcklwd  %%xmm5,%%xmm0                  \n"  // 8 ints in xmm0/1
-    "cvtdq2ps   %%xmm0,%%xmm0                  \n"  // 8 floats
-    "punpckhwd  %%xmm5,%%xmm1                  \n"
-    "cvtdq2ps   %%xmm1,%%xmm1                  \n"
-    "mulps      %%xmm4,%%xmm0                  \n"
-    "mulps      %%xmm4,%%xmm1                  \n"
-    "psrld      $0xd,%%xmm0                    \n"
-    "psrld      $0xd,%%xmm1                    \n"
-    "packssdw   %%xmm1,%%xmm0                  \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    "movdqa     %%xmm2,%%xmm3                  \n"
+    "punpcklwd  %%xmm5,%%xmm2                  \n"  // 8 ints in xmm2/3
+    "cvtdq2ps   %%xmm2,%%xmm2                  \n"  // 8 floats
+    "punpckhwd  %%xmm5,%%xmm3                  \n"
+    "cvtdq2ps   %%xmm3,%%xmm3                  \n"
+    "mulps      %%xmm4,%%xmm2                  \n"
+    "mulps      %%xmm4,%%xmm3                  \n"
+    "psrld      $0xd,%%xmm2                    \n"
+    "psrld      $0xd,%%xmm3                    \n"
+    "packssdw   %%xmm3,%%xmm2                  \n"
+    "movdqu     %%xmm2," MEMACCESS(1) "        \n"
+    "lea        " MEMLEA(0x10,1) ",%1         \n"
     "sub        $0x8,%2                        \n"
     "jg         1b                             \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "rm"(mult)    // %3
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  : "x"(scale * kScaleBias)  // %3
   : "memory", "cc",
-    "xmm0", "xmm1", "xmm4", "xmm5"
+    "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_HALFFLOATROW_SSE2
@@ -5411,17 +5410,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     // 16 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "vpmovzxwd  " MEMACCESS(0) ",%%ymm0        \n"  // 8 shorts -> 8 ints
-    "vpmovzxwd  " MEMACCESS2(0x10,0) ",%%ymm1  \n"  // 8 more
+    "vpmovzxwd  " MEMACCESS(0) ",%%ymm2        \n"  // 8 shorts -> 8 ints
+    "vpmovzxwd  " MEMACCESS2(0x10,0) ",%%ymm3  \n"  // 8 more
     "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vcvtdq2ps  %%ymm0,%%ymm0                  \n"
-    "vcvtdq2ps  %%ymm1,%%ymm1                  \n"
-    "vmulps     %%ymm0,%%ymm4,%%ymm0           \n"
-    "vmulps     %%ymm1,%%ymm4,%%ymm1           \n"
-    "vcvtps2ph  $3, %%ymm0, %%xmm0             \n"
-    "vcvtps2ph  $3, %%ymm1, %%xmm1             \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%xmm1," MEMACCESS2(0x10,1) "  \n"
+    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
+    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
+    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
+    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
+    "vcvtps2ph  $3, %%ymm2, %%xmm2             \n"
+    "vcvtps2ph  $3, %%ymm3, %%xmm3             \n"
+    "vmovdqu    %%xmm2," MEMACCESS(1) "        \n"
+    "vmovdqu    %%xmm3," MEMACCESS2(0x10,1) "  \n"
     "lea        " MEMLEA(0x20,1) ",%1          \n"
     "sub        $0x10,%2                       \n"
     "jg         1b                             \n"
@@ -5431,7 +5430,7 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "+r"(width)   // %2
   : "x"(scale)    // %3
   : "memory", "cc",
-    "xmm0", "xmm1", "xmm4"
+    "xmm2", "xmm3", "xmm4"
   );
 }
 #endif  // HAS_HALFFLOATROW_AVX2
diff --git a/source/row_win.cc b/source/row_win.cc
index baf6c940a..d2da0e439 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6095,6 +6095,42 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked)
+void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src */
+    mov        edx, [esp + 8]   /* dst */
+    movd       xmm4, dword ptr [esp + 12]  /* scale */
+    mov        ecx, [esp + 16]  /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+
+    // 8 pixel loop.
+  convertloop:
+    movdqu     xmm2, xmmword ptr [eax]  // 8 shorts
+    lea        eax, [eax + 16]
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm5
+    cvtdq2ps   xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd  xmm3, xmm5
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    psrld      xmm2, 13
+    psrld      xmm3, 13
+    packssdw   xmm2, xmm3
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
+
 #ifdef HAS_HALFFLOATROW_AVX2
 __declspec(naked)
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
@@ -6106,17 +6142,17 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
 
     // 8 pixel loop.
  convertloop:
-    vpmovzxwd  ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
-    vpmovzxwd  ymm1, xmmword ptr [eax + 16]  // 8 more shorts
+    vpmovzxwd  ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd  ymm3, xmmword ptr [eax + 16]  // 8 more shorts
     lea        eax, [eax + 32]
-    vcvtdq2ps  ymm0, ymm0  // convert 8 ints to floats
-    vcvtdq2ps  ymm1, ymm1
-    vmulps     ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
-    vmulps     ymm1, ymm1, ymm4
-    vcvtps2ph  xmm0, ymm0, 3  // float convert to 8 half floats truncate
-    vcvtps2ph  xmm1, ymm1, 3
-    vmovdqu    [edx], xmm0
-    vmovdqu    [edx + 16], xmm1
+    vcvtdq2ps  ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps  ymm3, ymm3
+    vmulps     ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps     ymm3, ymm3, ymm4
+    vcvtps2ph  xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph  xmm3, ymm3, 3
+    vmovdqu    [edx], xmm2
+    vmovdqu    [edx + 16], xmm3
     lea        edx, [edx + 32]
     sub        ecx, 16
    jg         convertloop
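
A note on the shared constant (reviewer commentary, not part of the patch): 1.9259299444e-34f is 2^-112. Multiplying by it lowers a single-precision exponent by 112, turning float's exponent bias of 127 into half-float's bias of 15 (127 - 112 = 15). Shifting the resulting float's bit pattern right by 13 then drops the extra mantissa bits (23 - 10 = 13) and leaves a valid half-float encoding, which is exactly what the mulps + psrld sequence computes. The logical shift also discards the sign bit, which is safe here because the uint16 inputs are non-negative; the trick additionally assumes scale keeps the scaled values below 2^16 so the half exponent cannot overflow. A minimal scalar sketch of the same conversion under those assumptions (HalfFloatRow_C_Ref is a hypothetical name, not in the patch):

#include <stdint.h>
#include <string.h>

// Scalar model of the SSE2 path above: bias-multiply, then reinterpret the
// float's bits and shift right by 13 to obtain the half-float bit pattern.
static void HalfFloatRow_C_Ref(const uint16_t* src, uint16_t* dst,
                               float scale, int width) {
  const float kBias = 1.9259299444e-34f;  // 2^-112: rebias exponent 127 -> 15
  int x;
  for (x = 0; x < width; ++x) {
    float f = src[x] * (scale * kBias);  // exponent now carries the half bias
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));     // bit-exact view, no aliasing UB
    dst[x] = (uint16_t)(bits >> 13);     // 23 -> 10 mantissa bits, truncating
  }
}

This matches the SIMD rows' rounding behavior: psrld and vcvtps2ph with immediate 3 both truncate toward zero.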