diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 6af2a1a9b..df0db47f1 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -5106,31 +5106,30 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile( - "vmovd %3,%%xmm2 \n" - "vpbroadcastw %%xmm2,%%ymm2 \n" + const int shift = __builtin_clz(scale) - 15; + asm volatile("vmovd %3,%%xmm2 \n" - // 32 pixels per loop. - LABELALIGN + // 32 pixels per loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "add $0x20,%0 \n" "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "add $0x40,%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(shift) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT8TO16ROW_AVX2 diff --git a/source/row_neon.cc b/source/row_neon.cc index 359cbf40f..1f1a3bbf3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3963,7 +3963,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + const int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr asm volatile( "vdup.16 q2, %3 \n" "1: \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 9bef8c44f..1f0d6e164 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -3983,8 +3983,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; + const int y1_fraction = source_y_fraction; + const int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( "cmp %w4, #0 \n" @@ -4119,10 +4119,10 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, int scale, int dst_width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; + const int y1_fraction = source_y_fraction; + const int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr + const int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr asm volatile( "dup v6.8h, %w6 \n" @@ -5529,7 +5529,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, // 15 - clz(scale), + 8 to shift result into the high half of the lane to // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. - int shift = 23 - __builtin_clz((int32_t)scale); + const int shift = 23 - __builtin_clz((int32_t)scale); asm volatile( "dup v2.8h, %w3 \n" "1: \n" @@ -5591,7 +5591,7 @@ void Convert8To16Row_NEON(const uint8_t* src_y, // (src * 0x0101 * scale) >> 16. // Since scale is a power of two, compute the shift to use to avoid needing // to widen to int32. - int shift = 15 - __builtin_clz(scale); + const int shift = 15 - __builtin_clz(scale); asm volatile( "dup v2.8h, %w[shift] \n" "1: \n" diff --git a/source/row_sme.cc b/source/row_sme.cc index a78f74150..bd61b20bf 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -569,7 +569,7 @@ __arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y, // 15 - clz(scale), + 8 to shift result into the high half of the lane to // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. - int shift = 23 - __builtin_clz((int32_t)scale); + const int shift = 23 - __builtin_clz((int32_t)scale); int vl; asm volatile( "cntb %x[vl] \n" @@ -917,7 +917,7 @@ __arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr, // 15 - clz(scale), + 8 to shift result into the high half of the lane to // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. - int shift = 23 - __builtin_clz((int32_t)scale); + const int shift = 23 - __builtin_clz((int32_t)scale); int vl; asm volatile( @@ -977,8 +977,8 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, int scale, int width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; + const int y1_fraction = source_y_fraction; + const int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; // y0_fraction == 0 is never called here. @@ -994,7 +994,7 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr, // 15 - clz(scale), + 8 to shift result into the high half of the lane to // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. - int shift = 23 - __builtin_clz((int32_t)scale); + const int shift = 23 - __builtin_clz((int32_t)scale); int vl; asm volatile( @@ -1085,7 +1085,7 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y, // (src * 0x0101 * scale) >> 16. // Since scale is a power of two, compute the shift to use to avoid needing // to widen to int32. - int shift = __builtin_clz(scale) - 15; + const int shift = __builtin_clz(scale) - 15; uint64_t vl; asm volatile( diff --git a/source/row_win.cc b/source/row_win.cc index 5d4aec9cf..933efec23 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, // 32 bit #else // defined(_M_X64) -// if HAS_ARGBTOUVROW_SSSE3 +// ifdef HAS_ARGBTOUVROW_SSSE3 // 8 bit fixed point 0.5, for bias of UV. static const ulvec8 kBiasUV128 = {