RVV: remove unused variables

- ARM Planar test: use regular asm volatile syntax
- x86 row functions: remove volatile from asm (see the note below)

Bug: 347111119, 347112532
Change-Id: I535b3dfa1a7a19824503bd95584a63b047b0e9a1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5637058
Reviewed-by: Justin Green <greenjustin@google.com>
Author: Frank Barchard
Date: 2024-06-17 12:51:24 -07:00
Parent: 7758c961c5
Commit: b0dfa70114
6 changed files with 890 additions and 1209 deletions
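
Note on the x86 change (a minimal sketch, not libyuv code): GCC only has to keep an
asm statement if it is marked volatile or has no output operands; when the outputs
are actually consumed, plain asm() is sufficient and gives the optimizer more freedom.
The helper below is a hypothetical example and assumes an x86-64 target whose
assembler accepts POPCNT:

    #include <stdint.h>

    // Plain asm() is kept because its output %0 feeds the return value.
    // "cc" is clobbered since popcnt writes the flags.
    static inline uint32_t popcount64(uint64_t x) {
      uint64_t r;
      asm("popcnt %1,%0" : "=r"(r) : "r"(x) : "cc");
      return (uint32_t)r;
    }

The ARM test change goes the other way: writing fpscr is a side effect the compiler
cannot see through the operands, so that statement keeps asm volatile and its
"memory" clobber.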

@ -21,14 +21,15 @@ extern "C" {
// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
// "memory" clobber prevents the reads from being removed
#if defined(__x86_64__)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint64_t diff = 0u;
uint64_t diff;
asm volatile(
"xor %3,%3 \n"
asm("xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n"
@ -63,9 +64,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=r"(diff) // %3
"=&r"(diff) // %3
:
: "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
: "cc", "memory", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
return (uint32_t)(diff);
}
@ -75,7 +76,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint32_t diff = 0u;
asm volatile(
asm(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
@ -104,7 +105,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
"+r"(count), // %2
"+r"(diff) // %3
:
: "memory", "cc", "ecx", "edx");
: "cc", "memory", "ecx", "edx");
return diff;
}
@ -117,10 +118,9 @@ static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
uint32_t diff;
asm volatile(
"movdqa %4,%%xmm2 \n"
asm("movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
@ -166,7 +166,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
return diff;
@ -176,10 +176,9 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff = 0u;
uint32_t diff;
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
asm("vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
@ -214,7 +213,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vmovd %%xmm0, %3 \n"
"vmovd %%xmm0,%3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@ -222,7 +221,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
"=r"(diff) // %3
: "m"(kNibbleMask), // %4
"m"(kBitCount) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
return diff;
}
@ -232,8 +231,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile(
"pxor %%xmm0,%%xmm0 \n"
asm("pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
@ -261,13 +259,12 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
"=g"(sse) // %3
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
"=r"(sse) // %3
:
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
return sse;
}
@ -299,8 +296,7 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
"movd %2,%%xmm0 \n"
asm("movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
@ -341,13 +337,13 @@ uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2
"=g"(hash) // %3
"=r"(hash) // %3
: "m"(kHash16x33), // %4
"m"(kHashMul0), // %5
"m"(kHashMul1), // %6
"m"(kHashMul2), // %7
"m"(kHashMul3) // %8
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
: "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
return hash;
}

@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width) {
asm volatile(
asm(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
asm(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d

File diff suppressed because it is too large.

@ -497,7 +497,6 @@ void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
vuint8m2x3_t v_dst_bgr = __riscv_vcreate_v_u8m2x3(v_r, v_g, v_b);
__riscv_vsseg3e8_v_u8m2x3(dst_raw, v_dst_bgr, vl);
w -= vl;
@ -2101,7 +2100,6 @@ void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 0);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 1);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 2);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_argb, 3);
v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
@ -2191,7 +2189,6 @@ void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
vuint16m4_t v_y_u16;
size_t vl = __riscv_vsetvl_e8m2(w);
vuint8m2x4_t v_src_rgba = __riscv_vlseg4e8_v_u8m2x4(src_rgba, vl);
vuint8m2_t v_a = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 0);
vuint8m2_t v_b = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 1);
vuint8m2_t v_g = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 2);
vuint8m2_t v_r = __riscv_vget_v_u8m2x4_u8m2(v_src_rgba, 3);

@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
asm(
// 16 pixel loop.
LABELALIGN
"1: \n"
@ -114,8 +114,8 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
@ -123,8 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -146,16 +145,15 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -195,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -209,11 +207,11 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
@ -221,8 +219,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -246,16 +243,15 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm4", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
}
void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
@ -297,8 +293,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
asm("pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@ -319,8 +314,8 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
@ -328,8 +323,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
asm("pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -383,8 +377,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
asm("vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
@ -408,16 +401,15 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm5");
:
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -472,8 +464,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"movdqa %0,%%xmm3 \n"
asm("movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
:
@ -481,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
@ -497,19 +488,18 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
asm("movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -517,8 +507,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
asm("movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -526,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -559,21 +548,20 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
asm("movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
:
@ -581,8 +569,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
asm("movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
:
@ -591,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -627,13 +614,13 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x18(%1),%1 \n"
"sub $0x18,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"m"(kMadd21) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
@ -641,8 +628,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"movdqa %3,%%xmm4 \n"
asm("movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
@ -671,8 +657,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
asm("movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"movdqa %3,%%xmm5 \n"
@ -682,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -703,20 +688,18 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %0,%%xmm2 \n"
asm("movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -725,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -765,12 +748,12 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"lea 0x6(%1),%1 \n"
"sub $0x6,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5,
@ -783,8 +766,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
asm("pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
@ -839,8 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
asm(LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
@ -953,8 +934,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"movdqa %3,%%xmm5 \n"
asm("movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@ -1005,8 +985,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm7,%%xmm7 \n"
asm("pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
"movdqa %5,%%xmm6 \n"
@ -1103,8 +1082,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -1156,8 +1134,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
asm("pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -1264,8 +1241,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
asm("pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -1305,8 +1281,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
asm("pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -1390,8 +1365,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -1434,8 +1408,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -1516,8 +1489,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
asm("vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -1568,8 +1540,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
asm("vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
@ -1630,8 +1601,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -1680,8 +1650,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@ -1763,10 +1732,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile("pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
// 16 pixel loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n" // src_ptr += 16
@ -1782,11 +1751,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#ifdef HAS_SCALEADDROW_AVX2
@ -1794,9 +1763,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
asm("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32
@ -1811,11 +1780,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
@ -1835,8 +1804,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile(
"movd %6,%%xmm2 \n"
asm("movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
"movd %k2,%%xmm5 \n"
@ -1932,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -1945,11 +1913,11 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
@ -1957,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1967,11 +1935,11 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
@ -1979,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1992,18 +1960,18 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2020,11 +1988,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Reads 4 pixels at a time.
@ -2037,8 +2005,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile(
"lea 0x00(,%1,4),%1 \n"
asm("lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
LABELALIGN
@ -2060,8 +2027,8 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
"+r"(dst_argb), // %2
"+r"(dst_width), // %3
"=&r"(src_stepx_x12) // %4
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
// Blends four 2x2 to 4x1.
@ -2074,8 +2041,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile(
"lea 0x00(,%1,4),%1 \n"
asm("lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n"
@ -2107,8 +2073,8 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
"+rm"(dst_width), // %3
"=&r"(src_stepx_x12), // %4
"+r"(row1) // %5
::"memory",
"cc", "xmm0", "xmm1", "xmm2", "xmm3");
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void ScaleARGBCols_SSE2(uint8_t* dst_argb,
@ -2117,8 +2083,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile(
"movd %5,%%xmm2 \n"
asm("movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
"pshufd $0x11,%%xmm3,%%xmm0 \n"
@ -2188,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
asm volatile(LABELALIGN
asm(LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2201,11 +2166,11 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
::"memory",
"cc", "xmm0", "xmm1");
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1");
}
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
@ -2226,16 +2191,14 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile(
"movdqa %0,%%xmm4 \n"
asm("movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
"movd %5,%%xmm2 \n"
asm("movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
"psrlw $0x9,%%xmm6 \n"
@ -2283,8 +2246,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"packuswb %%xmm0,%%xmm0 \n"
"movd %%xmm0,(%0) \n"
LABELALIGN
"99: \n" // clang-format error.
LABELALIGN "99: \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@ -2298,8 +2260,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile(
"cdq \n"
asm("cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"idiv %1 \n"
@ -2312,8 +2273,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile(
"cdq \n"
asm("cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
"sub $0x10001,%%eax \n"
@ -2344,8 +2304,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
asm("pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5, %%xmm5 \n" // zero
@ -2384,8 +2343,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
asm("vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n" // zero
@ -2428,8 +2386,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
asm("pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqa %3,%%xmm3 \n"
@ -2469,8 +2426,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
asm("pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqa %5,%%xmm7 \n"
@ -2553,8 +2509,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vbroadcastf128 %3,%%ymm3 \n"
@ -2596,8 +2551,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vbroadcastf128 %5,%%ymm7 \n"
@ -2676,8 +2630,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
asm("pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
"pslld $1,%%xmm4 \n" // all 2
@ -2728,8 +2681,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
asm("pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
"pslld $3,%%xmm6 \n" // all 8
@ -2819,8 +2771,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
asm("vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -2868,8 +2819,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
asm("vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8

@ -2551,11 +2551,12 @@ int TestHalfFloatPlane(int benchmark_width,
#if defined(__arm__)
static void EnableFlushDenormalToZero(void) {
uint32_t cw;
__asm__ __volatile__(
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)::"memory");
asm volatile (
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
}
#endif