ShortToHalfFloat_AVX2 function

BUG=libyuv:560 TEST=local compile for windows R=wangcheng@google.com Review URL: https://codereview.chromium.org/2364293002 .
2026-01-01 03:12:16 +08:00 · 2016-09-27 14:18:32 -07:00 · 2016-09-27 14:18:32 -07:00 · 6732bcbde9
commit 6732bcbde9
parent bcd823805c
2 changed files with 35 additions and 0 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -252,6 +252,7 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_AVX2
 #define HAS_J400TOARGBROW_AVX2
 #define HAS_RGB565TOARGBROW_AVX2
+#define HAS_SHORTTOF16ROW_AVX2
 #endif

 // The following are also available on x64 Visual C.
@ -1932,6 +1933,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width);

+// Scale 16 bit samples by a float and convert to half float.
+void ShortToF16Row_C(const uint16* src, int16* dst, float scale, int width);
+void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width);
+
 void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                             const uint8* luma, uint32 lumacoeff);
 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -6095,6 +6095,36 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2

+// Samples are assumed to be unsigned in the low 9, 10 or 12 bits.  The scale
+// factor adjusts the sample range to 0 to 1 using a float multiply.
+// e.g. 9 bit scale is 1.0f / 512.0f
+// e.g. 10 bit scale is 1.0f / 1024.0f
+#ifdef HAS_SHORTTOF16ROW_AVX2
+__declspec(naked)
+void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
+  __asm {
+    mov        eax, [esp + 4]      /* src */
+    mov        edx, [esp + 8]      /* dst */
+    vbroadcastss ymm4, [esp + 12]  /* scale */
+    mov        ecx, [esp + 16]     /* width */
+
+    // 8 pixel loop.  Width is consumed 8 samples per iteration.
+ convertloop:
+    vpmovzxwd   ymm0, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    lea         eax, [eax + 16]
+    vcvtdq2ps   ymm0, ymm0        // convert 8 ints to floats
+    vmulps      ymm0, ymm0, ymm4  // scale to normalized range 0 to 1
+    vcvtps2ph   xmm0, ymm0, 0     // convert to 8 half floats, round even
+    vmovdqu     [edx], xmm0
+    lea         edx, [edx + 16]
+    sub         ecx, 8
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SHORTTOF16ROW_AVX2
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Tranform ARGB pixels with color table.
 __declspec(naked)