Convert8To16: use VPSRLW instead of VPMULHUW for better lunarlake performance
- MCA says the old version was 4 cycles/loop and the new version is 2.5 cycles/loop
- lunarlake is the only cpu measured

mca -mcpu=lunarlake 100 iterations

Was vpmulhuw
Iterations:        100
Instructions:      1200
Total Cycles:      426
Total uOps:        1200
Dispatch Width:    8
uOps Per Cycle:    2.82
IPC:               2.82
Block RThroughput: 4.0

Now vpsrlw
Iterations:        100
Instructions:      1200
Total Cycles:      279
Total uOps:        1400
Dispatch Width:    8
uOps Per Cycle:    5.02
IPC:               4.30
Block RThroughput: 2.5

Bug: None
Change-Id: I5a49e1cf1ed3dfb59fe9861a871df9862417c6a6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6697745
Reviewed-by: richard winterton <rrwinterton@gmail.com>
parent cdd3bae848
commit 48943bb378
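The speedup comes from replacing a 16-bit multiply-high with a plain logical shift, which is only valid because Convert8To16 scales are powers of two. A minimal standalone C check of that identity, not part of the commit (the scale values are assumed examples for 10-, 12- and 14-bit targets):

// Standalone sanity check (not from the commit): for a power-of-two scale,
// (v * 0x0101 * scale) >> 16  ==  (v * 0x0101) >> (clz(scale) - 15),
// i.e. vpmulhuw by scale can be replaced with vpsrlw by the derived shift.
#include <assert.h>
#include <stdint.h>

int main(void) {
  // Assumed example scales for 10-, 12- and 14-bit targets.
  const uint32_t scales[] = {1024, 4096, 16384};
  for (int i = 0; i < 3; ++i) {
    const uint32_t scale = scales[i];
    const int shift = __builtin_clz(scale) - 15;  // 6, 4, 2
    for (uint32_t v = 0; v < 256; ++v) {
      const uint32_t wide = v * 0x0101u;  // what vpunpck{l,h}bw v,v builds
      assert(((wide * scale) >> 16) == (wide >> shift));  // mulhi == shift
    }
  }
  return 0;
}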
@@ -5106,31 +5106,30 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           uint16_t* dst_y,
                           int scale,
                           int width) {
-  asm volatile(
-      "vmovd %3,%%xmm2 \n"
-      "vpbroadcastw %%xmm2,%%ymm2 \n"
+  const int shift = __builtin_clz(scale) - 15;
+  asm volatile("vmovd %3,%%xmm2 \n"

       // 32 pixels per loop.
       LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vpermq $0xd8,%%ymm0,%%ymm0 \n"
       "add $0x20,%0 \n"
       "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
-      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
-      "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
+      "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n"
+      "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n"
       "vmovdqu %%ymm0,(%1) \n"
       "vmovdqu %%ymm1,0x20(%1) \n"
       "add $0x40,%1 \n"
       "sub $0x20,%2 \n"
       "jg 1b \n"
       "vzeroupper \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
         "+r"(width)   // %2
-      : "r"(scale)    // %3
+      : "r"(shift)    // %3
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_CONVERT8TO16ROW_AVX2

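An intrinsics rendering of the new loop may make the hunk above easier to follow. This is a sketch under the same power-of-two-scale assumption, not the committed code (libyuv keeps the inline asm):

#include <immintrin.h>
#include <stdint.h>

// Sketch of the new Convert8To16 inner loop using intrinsics. Assumes a
// power-of-two scale and width a multiple of 32. Not the committed code.
static void Convert8To16_AVX2_sketch(const uint8_t* src_y,
                                     uint16_t* dst_y,
                                     int scale,
                                     int width) {
  const int shift = __builtin_clz(scale) - 15;
  const __m128i vshift = _mm_cvtsi32_si128(shift);  // vmovd %3,%%xmm2
  for (int x = 0; x < width; x += 32) {
    __m256i v = _mm256_loadu_si256((const __m256i*)(src_y + x));
    // vpermq $0xd8 fixes lane order so the unpacks yield contiguous pixels.
    v = _mm256_permute4x64_epi64(v, 0xd8);
    __m256i lo = _mm256_unpacklo_epi8(v, v);  // v * 0x0101 per 16-bit lane
    __m256i hi = _mm256_unpackhi_epi8(v, v);
    lo = _mm256_srl_epi16(lo, vshift);  // vpsrlw replaces vpmulhuw
    hi = _mm256_srl_epi16(hi, vshift);
    _mm256_storeu_si256((__m256i*)(dst_y + x), lo);
    _mm256_storeu_si256((__m256i*)(dst_y + x + 16), hi);
  }
}

Note that vpsrlw takes its count from the low 64 bits of xmm2, so the vpbroadcastw of the old multiply path is no longer needed, saving one instruction per call.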
@@ -3963,7 +3963,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
-  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+  const int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
   asm volatile(
       "vdup.16 q2, %3 \n"
       "1: \n"

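The "Negative shl is shr" comment refers to NEON shift-by-register semantics, where a negative per-lane count shifts right. A scalar model of the identity, with scale = 16384 as an assumed example (10-bit source, so (v * 16384) >> 16 == v >> 2):

#include <assert.h>
#include <stdint.h>

// Scalar model of the NEON trick: shift = 15 - clz(scale) is negative for
// the scales of interest, and a shl by a negative amount acts as a shr.
static uint16_t shl_signed(uint16_t v, int shift) {
  return shift >= 0 ? (uint16_t)(v << shift) : (uint16_t)(v >> -shift);
}

int main(void) {
  const int scale = 16384;  // assumed 10-bit -> 8-bit example
  const int shift = 15 - __builtin_clz((int32_t)scale);  // -2
  for (uint32_t v = 0; v < 1024; ++v) {
    assert(shl_signed((uint16_t)v, shift) == ((v * (uint32_t)scale) >> 16));
  }
  return 0;
}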
@@ -3983,8 +3983,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint8_t* src_ptr1 = src_ptr + src_stride;
   asm volatile(
       "cmp %w4, #0 \n"

@@ -4119,10 +4119,10 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
                                int scale,
                                int dst_width,
                                int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;
-  int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
+  const int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr

   asm volatile(
       "dup v6.8h, %w6 \n"

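For context, the InterpolateRow functions above blend two rows with 8-bit fractional weights. A scalar sketch of the blend the fractions describe; the +128 round-to-nearest term is an assumption, not taken from this diff:

#include <stdint.h>

// Scalar sketch of two-row interpolation with the fractions named in the
// diff: y1_fraction weights the next row, y0_fraction = 256 - y1_fraction
// weights the current row. The +128 rounding is an assumption.
static void interpolate_row_sketch(uint8_t* dst, const uint8_t* src,
                                   const uint8_t* src1, int width,
                                   int y1_fraction) {
  const int y0_fraction = 256 - y1_fraction;
  for (int x = 0; x < width; ++x) {
    dst[x] =
        (uint8_t)((src[x] * y0_fraction + src1[x] * y1_fraction + 128) >> 8);
  }
}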
@@ -5529,7 +5529,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
   asm volatile(
       "dup v2.8h, %w3 \n"
       "1: \n"

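The 23 - clz(scale) comment above (repeated in the SME hunks below) describes a narrowing trick: a saturating left shift lands the 8-bit result in the high byte of each 16-bit lane, so UZP2 over bytes narrows in one step. A scalar model, with scale = 16384 as an assumed example:

#include <assert.h>
#include <stdint.h>

// Scalar model of the comment's trick: saturating left shift by
// 23 - clz(scale) puts the result in the high byte of the 16-bit lane,
// so UZP2 (keep high bytes) narrows without saturating-narrow pairs.
static uint8_t convert16to8_sketch(uint16_t v, int scale) {
  const int shift = 23 - __builtin_clz((int32_t)scale);  // e.g. 6 for 16384
  uint32_t x = (uint32_t)v << shift;
  if (x > 0xffff) x = 0xffff;  // what the saturating shift provides
  return (uint8_t)(x >> 8);    // what UZP2 extracts
}

int main(void) {
  for (uint32_t v = 0; v <= 0xffff; ++v) {
    uint32_t ref = (v * 16384u) >> 16;  // (v * scale) >> 16
    if (ref > 255) ref = 255;           // saturated to 8 bits
    assert(convert16to8_sketch((uint16_t)v, 16384) == ref);
  }
  return 0;
}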
@@ -5591,7 +5591,7 @@ void Convert8To16Row_NEON(const uint8_t* src_y,
   // (src * 0x0101 * scale) >> 16.
   // Since scale is a power of two, compute the shift to use to avoid needing
   // to widen to int32.
-  int shift = 15 - __builtin_clz(scale);
+  const int shift = 15 - __builtin_clz(scale);
   asm volatile(
       "dup v2.8h, %w[shift] \n"
       "1: \n"

@@ -569,7 +569,7 @@ __arm_locally_streaming void Convert16To8Row_SME(const uint16_t* src_y,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);
   int vl;
   asm volatile(
       "cntb %x[vl] \n"

@@ -917,7 +917,7 @@ __arm_locally_streaming static void HalfRow_16To8_SME(uint8_t* dst_ptr,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);

   int vl;
   asm volatile(

@@ -977,8 +977,8 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
                                                       int scale,
                                                       int width,
                                                       int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
-  int y0_fraction = 256 - y1_fraction;
+  const int y1_fraction = source_y_fraction;
+  const int y0_fraction = 256 - y1_fraction;
   const uint16_t* src_ptr1 = src_ptr + src_stride;

   // y0_fraction == 0 is never called here.

@@ -994,7 +994,7 @@ __arm_locally_streaming void InterpolateRow_16To8_SME(uint8_t* dst_ptr,
   // 15 - clz(scale), + 8 to shift result into the high half of the lane to
   // saturate, then we can just use UZP2 to narrow rather than a pair of
   // saturating narrow instructions.
-  int shift = 23 - __builtin_clz((int32_t)scale);
+  const int shift = 23 - __builtin_clz((int32_t)scale);

   int vl;
   asm volatile(

@@ -1085,7 +1085,7 @@ __arm_locally_streaming void Convert8To16Row_SME(const uint8_t* src_y,
   // (src * 0x0101 * scale) >> 16.
   // Since scale is a power of two, compute the shift to use to avoid needing
   // to widen to int32.
-  int shift = __builtin_clz(scale) - 15;
+  const int shift = __builtin_clz(scale) - 15;

   uint64_t vl;
   asm volatile(

@@ -184,7 +184,7 @@ void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
 // 32 bit
 #else  // defined(_M_X64)

-// if HAS_ARGBTOUVROW_SSSE3
+// ifdef HAS_ARGBTOUVROW_SSSE3

 // 8 bit fixed point 0.5, for bias of UV.
 static const ulvec8 kBiasUV128 = {