/*
* Copyright 2013 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/row.h"
#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || defined(__i386__)) && \
!defined(LIBYUV_ENABLE_ROWWIN)
// Offsets for source bytes 0 to 9
static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
128, 128, 128, 128, 128, 128, 128, 128};
// Offsets for source bytes 0 to 10
static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7,
8, 9, 9, 10, 10, 11, 12, 13};
// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
10, 11, 12, 13, 13, 14, 14, 15};
// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
// Rounding constant (+2) applied by the 3/4 width scalers before >> 2.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
6, 8, 11, 14, 128, 128, 128, 128};
// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128};
// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
6, 7, 12, 13, 128, 128, 128, 128};
// Scaling values for boxes of 3x3 and 2x3
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
65536 / 9, 65536 / 6, 0, 0};
// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
11, 128, 14, 128, 128, 128, 128, 128};
// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
12, 128, 15, 128, 128, 128, 128, 128};
// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
13, 128, 128, 128, 128, 128, 128, 128};
// Scaling values for boxes of 3x2 and 2x2
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
65536 / 3, 65536 / 2, 0, 0};
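// The kScaleAc33/kScaleAb2 words are 16.16 fixed point reciprocals: a box sum
// multiplied by 65536 / 9 with only the high 16 bits kept (as pmulhuw does)
// approximates division by 9 without an integer divide. A minimal scalar
// sketch of the idea (illustrative only, not part of the build):
static inline uint8_t ScaleSumByReciprocal_Sketch(uint16_t sum,
                                                  uint16_t reciprocal) {
  // High 16 bits of the 16 x 16 -> 32 bit product, as pmulhuw computes.
  return (uint8_t)(((uint32_t)sum * reciprocal) >> 16);
}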
// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
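// What the loop above computes, per output byte: vpsrlw $0x8 keeps the high
// byte of each 16-bit pair and vpackuswb repacks, i.e. point sampling the odd
// source bytes. A scalar sketch of the same operation (illustrative only,
// not part of the build):
static inline void ScaleRowDown2_Sketch(const uint8_t* src_ptr,
                                        uint8_t* dst_ptr,
                                        int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep the second byte of each pair
  }
}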
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpabsb %%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpabsb %%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
"vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
"lea 0x40(%0),%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
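// The box filter above averages each 2x2 block with rounding. pmaddubsw with
// the 0x01 multipliers forms horizontal pair sums, the two rows are added,
// and the vpsrlw $1 / vpavgw-with-zero pair is equivalent to (sum + 2) >> 2.
// A scalar sketch (illustrative only, not part of the build):
static inline void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}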
#endif // HAS_SCALEROWDOWN2_AVX2
#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"lea 0x40(%0),%0 \n"
"vpand %%ymm5,%%ymm0,%%ymm0 \n"
"vpand %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpabsw %%ymm4,%%ymm5 \n"
"vpabsb %%ymm4,%%ymm4 \n" // 0x0101
"vpsllw $0x3,%%ymm5,%%ymm5 \n" // 0x0008
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n"
"vmovdqu 0x20(%0,%3,1),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu 0x00(%0,%3,2),%%ymm2 \n"
"vmovdqu 0x20(%0,%3,2),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vmovdqu 0x00(%0,%4,1),%%ymm2 \n"
"vmovdqu 0x20(%0,%4,1),%%ymm3 \n"
"lea 0x40(%0),%0 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
"vpsrlw $0x4,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(src_stride * 3) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
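// The 4x4 box filter sums 16 source pixels per output pixel: pmaddubsw forms
// pair sums, four rows are accumulated, vphaddw completes the horizontal sum,
// and ymm5 (8) rounds before the >> 4. A scalar sketch (illustrative only,
// not part of the build):
static inline void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 8;  // rounding term
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (uint8_t)(sum >> 4);
  }
}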
#endif // HAS_SCALEROWDOWN4_AVX2
static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
10, 11, 8, 9, 14, 15, 12, 13};
static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
#ifdef HAS_SCALEROWUP2_LINEAR_AVX2
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
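// Scalar sketch of the 2x horizontal upsample above (illustrative only, not
// part of the build): each output pair blends a near and a far source pixel
// as (3 * near + far + 2) >> 2, i.e. 3/4 * near + 1/4 * far with rounding,
// which is the weighting kLinearMadd31 feeds into pmaddubsw.
static inline void ScaleRowUp2_Linear_Sketch(const uint8_t* src_ptr,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    dst_ptr[2 * x + 0] = (uint8_t)((3 * src_ptr[x] + src_ptr[x + 1] + 2) >> 2);
    dst_ptr[2 * x + 1] = (uint8_t)((src_ptr[x] + 3 * src_ptr[x + 1] + 2) >> 2);
  }
}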
#ifdef HAS_SCALEROWUP2_BILINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
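// Scalar sketch of the 2x2 bilinear upsample above (illustrative only, not
// part of the build): each output pixel is a 9:3:3:1 blend of the four
// nearest source pixels, (9 * near + 3 * horiz + 3 * vert + diag + 8) >> 4,
// which the asm builds from the two per-row (3 * near + far) terms.
static inline void ScaleRowUp2_Bilinear_Sketch(const uint8_t* src_ptr,
                                               ptrdiff_t src_stride,
                                               uint8_t* dst_ptr,
                                               ptrdiff_t dst_stride,
                                               int dst_width) {
  const uint8_t* s = src_ptr;               // source row 0
  const uint8_t* t = src_ptr + src_stride;  // source row 1
  uint8_t* d = dst_ptr;                     // output row above
  uint8_t* e = dst_ptr + dst_stride;        // output row below
  for (int x = 0; x < dst_width / 2; ++x) {
    d[2 * x + 0] =
        (uint8_t)((9 * s[x] + 3 * s[x + 1] + 3 * t[x] + t[x + 1] + 8) >> 4);
    d[2 * x + 1] =
        (uint8_t)((3 * s[x] + 9 * s[x + 1] + t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 0] =
        (uint8_t)((3 * s[x] + s[x + 1] + 9 * t[x] + 3 * t[x + 1] + 8) >> 4);
    e[2 * x + 1] =
        (uint8_t)((s[x] + 3 * s[x + 1] + 3 * t[x] + 9 * t[x + 1] + 8) >> 4);
  }
}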
#ifdef HAS_SCALEROWUP2_LINEAR_12_AVX2
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b)
"vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near)
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
"vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far)
"vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2
"vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2
"vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2
"vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2
"vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
"vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm2,32(%1) \n"
"lea 0x20(%0),%0 \n"
"lea 0x40(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearShuffleFar) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
#ifdef HAS_SCALEROWUP2_BILINEAR_12_AVX2
void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
"vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
"vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
"vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
"vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near)
"vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far)
"vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far
"vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near
"vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2)
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kLinearShuffleFar) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif
#ifdef HAS_SCALEROWUP2_LINEAR_16_AVX2
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
"vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
"vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
"vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
"vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
"vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
"vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
"vpshufd $0b11011000,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEROWUP2_BILINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v)
"vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far)
"vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far)
"vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
"vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
"vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo)
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v)
"vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far)
"vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far)
"vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
"vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
"vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
"vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
"vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo)
"vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
"vpshufd $0b11011000,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
"vpshufd $0b11011000,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32
"vpermq $0xd8,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n"
"vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n"
"vpaddusw (%1),%%ymm2,%%ymm0 \n"
"vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu %%ymm1,0x20(%1) \n"
"lea 0x40(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
0x4040, 0x4040, 0x4040, 0x4040};
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
};
// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
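// Portable sketch of the computation above (illustrative only, not part of
// the build): the asm widens num << 16 into edx:eax and divides with idiv.
static inline int FixedDiv_Sketch(int num, int div) {
  return (int)((((int64_t)num) << 16) / div);  // e.g. FixedDiv(1, 2) -> 0x8000
}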
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
"sbb $0x0,%%edx \n"
"sub $0x1,%1 \n"
"idiv %1 \n"
"mov %0, %%eax \n"
: "+a"(num) // %0
: "c"(div) // %1
: "memory", "cc", "edx");
return num;
}
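// Portable sketch of FixedDiv1 (illustrative only, not part of the build):
// subtract 0x10001 (one pixel plus one lsb in 16.16) from num << 16, then
// divide by div - 1.
static inline int FixedDiv1_Sketch(int num, int div) {
  return (int)(((((int64_t)num) << 16) - 0x00010001) / (div - 1));
}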
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
// Shuffle table for splitting UV into upper and lower part of register.
static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u,
1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u};
static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u,
6u, 14u, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80};
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpabsb %%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
"vbroadcastf128 %4,%%ymm1 \n" // split shuffler
"vbroadcastf128 %5,%%ymm3 \n" // merge shuffler
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // 16 UV row 0
"vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1
"lea 0x20(%0),%0 \n"
"vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv
"vpshufb %%ymm1,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add
"vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords
"vmovdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n" // 8 UV
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"m"(kShuffleSplitUV), // %4
"m"(kShuffleMergeUV) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
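// Scalar sketch of the interleaved UV 2x2 box filter above (illustrative
// only, not part of the build): U and V are averaged independently with the
// same (sum + 2) >> 2 rounding as the planar version.
static inline void ScaleUVRowDown2Box_Sketch(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {  // dst_width counts UV pairs
    dst_ptr[2 * x + 0] = (uint8_t)((s[0] + s[2] + t[0] + t[2] + 2) >> 2);  // U
    dst_ptr[2 * x + 1] = (uint8_t)((s[1] + s[3] + t[1] + t[3] + 2) >> 2);  // V
    s += 4;
    t += 4;
  }
}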
static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
3, 1, 3, 1, 1, 3, 1, 3};
#ifdef HAS_SCALEUVROWUP2_LINEAR_AVX2
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kUVLinearMadd31) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEUVROWUP2_BILINEAR_AVX2
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
"vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride), // %4
"m"(kUVLinearMadd31) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALEUVROWUP2_LINEAR_16_AVX2
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
"vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
"vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
"vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
"vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo)
"vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi)
"vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
"vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackusdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEUVROWUP2_BILINEAR_16_AVX2
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v)
"vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v)
"vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far)
"vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far)
"vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi)
"vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo)
"vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo)
"vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v)
"vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v)
"vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v)
"vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v)
"vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far)
"vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far)
"vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo)
"vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi)
"vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo)
"vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi)
"vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo)
"vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackusdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackusdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"(src_stride), // %3
"r"(dst_stride) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif