mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Convert8To16: use VPSRLW instead of VPMULHUW for better Lunar Lake performance
llvm-mca reports the old version at 4.0 cycles/loop and the new version at
2.5 cycles/loop. Lunar Lake is the only CPU measured
(llvm-mca -mcpu=lunarlake, 100 iterations).

Was (vpmulhuw):
  Iterations:        100
  Instructions:      1200
  Total Cycles:      426
  Total uOps:        1200
  Dispatch Width:    8
  uOps Per Cycle:    2.82
  IPC:               2.82
  Block RThroughput: 4.0

Now (vpsrlw):
  Iterations:        100
  Instructions:      1200
  Total Cycles:      279
  Total uOps:        1400
  Dispatch Width:    8
  uOps Per Cycle:    5.02
  IPC:               4.30
  Block RThroughput: 2.5

Bug: None
Change-Id: I5a49e1cf1ed3dfb59fe9861a871df9862417c6a6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6697745
Reviewed-by: richard winterton <rrwinterton@gmail.com>
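The throughput figures above come from llvm-mca; a run along these lines
should reproduce them (the input assembly file name is illustrative, not
part of the commit):

  llvm-mca -mcpu=lunarlake -iterations=100 convert8to16_loop.s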
This commit is contained in:
parent cdd3bae848
commit 48943bb378
@@ -5106,31 +5106,30 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           uint16_t* dst_y,
                           int scale,
                           int width) {
-  asm volatile(
-      "vmovd %3,%%xmm2 \n"
-      "vpbroadcastw %%xmm2,%%ymm2 \n"
+  const int shift = __builtin_clz(scale) - 15;
+  asm volatile("vmovd %3,%%xmm2 \n"
 
-      // 32 pixels per loop.
-      LABELALIGN
+               // 32 pixels per loop.
+               LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vpermq $0xd8,%%ymm0,%%ymm0 \n"
       "add $0x20,%0 \n"
       "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
-      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
-      "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+      "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n"
+      "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n"
       "vmovdqu %%ymm0,(%1) \n"
       "vmovdqu %%ymm1,0x20(%1) \n"
       "add $0x40,%1 \n"
       "sub $0x20,%2 \n"
       "jg 1b \n"
       "vzeroupper \n"
-      : "+r"(src_y),  // %0
-        "+r"(dst_y),  // %1
-        "+r"(width)   // %2
-      : "r"(scale)    // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+               : "+r"(src_y),  // %0
+                 "+r"(dst_y),  // %1
+                 "+r"(width)   // %2
+               : "r"(shift)    // %3
+               : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_CONVERT8TO16ROW_AVX2
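The hunk above works because Convert8To16Row's scale is a power of two (the
NEON comments later in this diff say as much), so the high half of an
unsigned 16x16 multiply (vpmulhuw) equals a plain logical right shift
(vpsrlw). A minimal scalar sketch of that equivalence, assuming
scale = 2^n with 1 <= n <= 15; function names are illustrative, not
libyuv API:

#include <assert.h>
#include <stdint.h>

// Old path (vpmulhuw): widen the byte by replicating it (v * 0x0101), then
// keep the high 16 bits of an unsigned multiply by scale.
static uint16_t widen_mulhi(uint8_t v, uint32_t scale) {
  uint32_t wide = (uint32_t)v * 0x0101;  // 8 -> 16 bit widening
  return (uint16_t)((wide * scale) >> 16);
}

// New path (vpsrlw): for scale == 2^n, (x * 2^n) >> 16 == x >> (16 - n),
// and 16 - n is exactly __builtin_clz(scale) - 15 for a 32-bit scale.
static uint16_t widen_shift(uint8_t v, uint32_t scale) {
  int shift = __builtin_clz(scale) - 15;
  uint32_t wide = (uint32_t)v * 0x0101;
  return (uint16_t)(wide >> shift);
}

int main(void) {
  for (int n = 1; n <= 15; ++n) {  // power-of-two scales fitting a 16-bit lane
    uint32_t scale = 1u << n;
    for (int v = 0; v < 256; ++v) {
      assert(widen_mulhi((uint8_t)v, scale) == widen_shift((uint8_t)v, scale));
    }
  }
  return 0;
}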
@@ -3963,7 +3963,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
-  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+  const int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
   asm volatile(
       "vdup.16 q2, %3 \n"
       "1: \n"
@@ -3983,8 +3983,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint8_t* src_ptr1 = src_ptr + src_stride;
   asm volatile(
       "cmp %w4, #0 \n"
@@ -4119,10 +4119,10 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
                                int scale,
                                int dst_width,
                                int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;
-  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+  const int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
 
   asm volatile(
       "dup v6.8h, %w6 \n"
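For context on the y0/y1 fractions that these interpolate hunks
const-qualify: the kernels blend two rows in 8.8 fixed point. A scalar
sketch of the idea (the real kernels' exact rounding is not shown here):

#include <stdint.h>

// Scalar model of the InterpolateRow blend: mix each pixel of two rows,
// where y1_fraction in [0, 256] selects how much of the second row to use.
static void interpolate_row(uint8_t* dst, const uint8_t* src0,
                            const uint8_t* src1, int width,
                            int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  for (int x = 0; x < width; ++x) {
    // Weighted sum, then drop the 8 fraction bits.
    dst[x] = (uint8_t)((src0[x] * y0_fraction + src1[x] * y1_fraction) >> 8);
  }
}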
@@ -5529,7 +5529,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
   asm volatile(
       "dup v2.8h, %w3 \n"
       "1: \n"
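The comment in the hunk above compresses a nice trick: rather than multiply
by the power-of-two scale and then saturating-narrow twice, shift left so
the wanted byte lands in the high half of each 16-bit lane (with
saturation), then keep the high bytes with a single UZP2. A scalar sketch,
assuming unsigned saturation and scale = 2^n with 8 <= n <= 14; names are
illustrative:

#include <assert.h>
#include <stdint.h>

// Reference: dst = min((src * scale) >> 16, 255).
static uint8_t narrow_ref(uint16_t v, uint32_t scale) {
  uint32_t r = ((uint32_t)v * scale) >> 16;
  return (uint8_t)(r > 255 ? 255 : r);
}

// Trick: shift left by log2(scale) - 8 with 16-bit saturation, so the
// result sits in the high byte of the lane; taking the high bytes (what
// UZP2 does across a vector) both narrows and saturates.
static uint8_t narrow_shift(uint16_t v, uint32_t scale) {
  int shift = 23 - __builtin_clz(scale);  // log2(scale) - 8
  uint32_t t = (uint32_t)v << shift;
  if (t > 0xffff) t = 0xffff;             // saturating shift
  return (uint8_t)(t >> 8);               // keep the high byte
}

int main(void) {
  for (int n = 8; n <= 14; ++n) {  // e.g. 16384 maps 10-bit input to 8-bit
    uint32_t scale = 1u << n;
    for (uint32_t v = 0; v <= 0xffff; ++v) {
      assert(narrow_ref((uint16_t)v, scale) == narrow_shift((uint16_t)v, scale));
    }
  }
  return 0;
}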
@@ -5591,7 +5591,7 @@ void Convert8To16Row_NEON(const uint8_t* src_y,
   // (src * 0x0101 * scale) >> 16.
   // Since scale is a power of two, compute the shift to use to avoid needing
   // to widen to int32.
-  int shift = 15 - __builtin_clz(scale);
+  const int shift = 15 - __builtin_clz(scale);
   asm volatile(
       "dup v2.8h, %w[shift] \n"
       "1: \n"
@@ -569,7 +569,7 @@ __arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
   int vl;
   asm volatile(
       "cntb %x[vl] \n"
@@ -917,7 +917,7 @@ __arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
 
   int vl;
   asm volatile(
@@ -977,8 +977,8 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
                                                       int scale,
                                                       int width,
                                                       int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;
 
   // y0_fraction == 0 is never called here.
@@ -994,7 +994,7 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
 
   int vl;
   asm volatile(
@@ -1085,7 +1085,7 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
   // (src * 0x0101 * scale) >> 16.
   // Since scale is a power of two, compute the shift to use to avoid needing
   // to widen to int32.
-  int shift = __builtin_clz(scale) - 15;
+  const int shift = __builtin_clz(scale) - 15;
 
   uint64_t vl;
   asm volatile(
@@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
 // 32 bit
 #else  // defined(_M_X64)
 
-// if HAS_ARGBTOUVROW_SSSE3
+// ifdef HAS_ARGBTOUVROW_SSSE3
 
 // 8 bit fixed point 0.5, for bias of UV.
 static const ulvec8 kBiasUV128 = {