Add volatile to gcc inline assembly to avoid it being removed by the optimizer

Bug: b/42280943
Change-Id: I4439077a92ffa6dff91d2d10accd5251b76f7544
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5671187
Reviewed-by: David Gao <davidgao@google.com>
This commit is contained in:
Frank Barchard 2024-07-01 18:18:10 -07:00
parent efd164d64e
commit 616bee5420
21 changed files with 795 additions and 610 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1888 Version: 1889
License: BSD License: BSD
License File: LICENSE License File: LICENSE
Shipped: yes Shipped: yes

View File

@ -20,7 +20,7 @@
({ \ ({ \
const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
uint32_t val_m; \ uint32_t val_m; \
asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ asm("lw %[val_m], %[psrc_lw_m] \n" \
: [val_m] "=r"(val_m) \ : [val_m] "=r"(val_m) \
: [psrc_lw_m] "m"(*psrc_lw_m)); \ : [psrc_lw_m] "m"(*psrc_lw_m)); \
val_m; \ val_m; \
@ -31,7 +31,7 @@
({ \ ({ \
const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
uint64_t val_m = 0; \ uint64_t val_m = 0; \
asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ asm("ld %[val_m], %[psrc_ld_m] \n" \
: [val_m] "=r"(val_m) \ : [val_m] "=r"(val_m) \
: [psrc_ld_m] "m"(*psrc_ld_m)); \ : [psrc_ld_m] "m"(*psrc_ld_m)); \
val_m; \ val_m; \
@ -55,7 +55,7 @@
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ asm("sw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })
@ -65,7 +65,7 @@
({ \ ({ \
uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
uint64_t val_m = (val); \ uint64_t val_m = (val); \
asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ asm("sd %[val_m], %[pdst_sd_m] \n" \
: [pdst_sd_m] "=m"(*pdst_sd_m) \ : [pdst_sd_m] "=m"(*pdst_sd_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })
@ -86,8 +86,7 @@
uint8_t* psrc_lw_m = (uint8_t*)(psrc); \ uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
uint32_t val_lw_m; \ uint32_t val_lw_m; \
\ \
__asm__ volatile( \ asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
"lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
"lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
\ \
: [val_lw_m] "=&r"(val_lw_m) \ : [val_lw_m] "=&r"(val_lw_m) \
@ -102,8 +101,7 @@
uint8_t* psrc_ld_m = (uint8_t*)(psrc); \ uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
uint64_t val_ld_m = 0; \ uint64_t val_ld_m = 0; \
\ \
__asm__ volatile( \ asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
"ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
"ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
\ \
: [val_ld_m] "=&r"(val_ld_m) \ : [val_ld_m] "=&r"(val_ld_m) \
@ -130,7 +128,7 @@
({ \ ({ \
uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
uint32_t val_m = (val); \ uint32_t val_m = (val); \
asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ asm("usw %[val_m], %[pdst_sw_m] \n" \
: [pdst_sw_m] "=m"(*pdst_sw_m) \ : [pdst_sw_m] "=m"(*pdst_sw_m) \
: [val_m] "r"(val_m)); \ : [val_m] "r"(val_m)); \
}) })

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1888 #define LIBYUV_VERSION 1889
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) { int count) {
uint64_t diff; uint64_t diff;
asm("xor %3,%3 \n" asm volatile (
"xor %3,%3 \n"
"xor %%r8,%%r8 \n" "xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n" "xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n" "xor %%r10,%%r10 \n"
@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) { int count) {
uint32_t diff = 0u; uint32_t diff = 0u;
asm( asm volatile (
// Process 16 bytes per loop. // Process 16 bytes per loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
int count) { int count) {
uint32_t diff; uint32_t diff;
asm("movdqa %4,%%xmm2 \n" asm volatile (
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n" "movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n" "pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n" "pxor %%xmm1,%%xmm1 \n"
@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
int count) { int count) {
uint32_t diff; uint32_t diff;
asm("vbroadcastf128 %4,%%ymm2 \n" asm volatile (
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n" "vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b, const uint8_t* src_b,
int count) { int count) {
uint32_t sse; uint32_t sse;
asm("pxor %%xmm0,%%xmm0 \n" asm volatile (
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
LABELALIGN LABELALIGN
@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash; uint32_t hash;
asm("movd %2,%%xmm0 \n" asm volatile (
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n" "movdqa %4,%%xmm6 \n"

View File

@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b, uint8_t* dst_b,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
asm( asm volatile (
// Read in the data from the source pointer. // Read in the data from the source pointer.
// First round of bit swap. // First round of bit swap.
LABELALIGN LABELALIGN
@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Main loop transpose 4x4. Read a column, write a row. // Main loop transpose 4x4. Read a column, write a row.
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d "movdqu (%0),%%xmm0 \n" // a b c d
@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst, uint8_t* dst,
int dst_stride, int dst_stride,
int width) { int width) {
asm( asm volatile (
// Main loop transpose 2 blocks of 4x4. Read a column, write a row. // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n" "1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d "vmovdqu (%0),%%xmm0 \n" // a b c d

View File

@ -51,6 +51,16 @@ extern "C" {
out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
} }
void TransposeWx16_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
TransposeWx8_C(src, src_stride, dst, dst_stride, width);
TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
width);
}
void TransposeUVWx16_C(const uint8_t* src, void TransposeUVWx16_C(const uint8_t* src,
int src_stride, int src_stride,
uint8_t* dst_a, uint8_t* dst_a,

View File

@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src,
int dst_stride, int dst_stride,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this
@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this

View File

@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src,
int dst_stride, int dst_stride,
int width) { int width) {
const uint8_t* src_temp; const uint8_t* src_temp;
asm("1: \n" asm volatile (
"1: \n"
"mov %[src_temp], %[src] \n" "mov %[src_temp], %[src] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b, int dst_stride_b,
int width) { int width) {
const uint8_t* temp; const uint8_t* temp;
asm( asm volatile (
// loops are on blocks of 8. loop will stop when // loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter // counter gets to or below 0. starting the counter
// at w-8 allow for this // at w-8 allow for this

File diff suppressed because it is too large Load Diff

View File

@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants
@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile( asm("vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
"vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants "vldrepl.h $vr3, %3, 4 \n\t" // load rgbconstants

View File

@ -261,7 +261,7 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0; uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000; uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n" "dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n" "dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB "1: \n" READYUV210 NVTORGB
@ -289,7 +289,7 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0; uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000; uint16_t alpha = 0xc000;
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n" "dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n" "dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB "1: \n" READYUV410 NVTORGB
@ -313,7 +313,7 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8b, #255 \n" "movi v19.8b, #255 \n"
"1: \n" READYUV210 NVTORGB RGBTORGB8 "1: \n" READYUV210 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
@ -335,7 +335,7 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8b, #255 \n" "movi v19.8b, #255 \n"
"1: \n" READYUV410 NVTORGB RGBTORGB8 "1: \n" READYUV410 NVTORGB RGBTORGB8
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
@ -408,7 +408,7 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"1: \n" "1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
@ -433,7 +433,7 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y,
uint8_t* dst_argb, uint8_t* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"1: \n" "1: \n"
"ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210
"uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8
@ -591,7 +591,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb1555, uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
asm(YUVTORGB_SETUP asm volatile (YUVTORGB_SETUP
"movi v19.8h, #0x80, lsl #8 \n" "movi v19.8h, #0x80, lsl #8 \n"
"1: \n" // "1: \n" //
READYUV422 I4XXTORGB RGBTORGB8_TOP READYUV422 I4XXTORGB RGBTORGB8_TOP

View File

@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" /* A */ "dup z19.b, #255 \n" /* A */
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" "ptrue p0.b \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
YUVTORGB_SVE_SETUP YUVTORGB_SVE_SETUP
@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" /* A */ "dup z19.b, #255 \n" /* A */
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("cnth %[vl] \n" asm volatile (
"cnth %[vl] \n"
"ptrue p0.b \n" YUVTORGB_SVE_SETUP "ptrue p0.b \n" YUVTORGB_SVE_SETUP
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
uint32_t nv_v_start, uint32_t nv_v_start,
uint32_t nv_v_step) { uint32_t nv_v_step) {
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
width_last_y = width_last_y == 0 ? vl : width_last_y; width_last_y = width_last_y == 0 ? vl : width_last_y;
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP asm volatile (
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
unsigned bsl_mask = 0x7e0; unsigned bsl_mask = 0x7e0;
uint64_t vl; uint64_t vl;
width *= 2; width *= 2;
asm("mov z3.h, #3 \n" asm volatile (
"mov z3.h, #3 \n"
"dup z4.h, %w[bsl_mask] \n" "dup z4.h, %w[bsl_mask] \n"
"cntb %[vl] \n" "cntb %[vl] \n"
@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
unsigned bsl_mask = 0x7e0; unsigned bsl_mask = 0x7e0;
uint64_t vl; uint64_t vl;
width *= 2; width *= 2;
asm("mov z3.h, #3 \n" asm volatile (
"mov z3.h, #3 \n"
"dup z4.h, %w[bsl_mask] \n" "dup z4.h, %w[bsl_mask] \n"
"dup z2.s, %w[dither4] \n" "dup z2.s, %w[dither4] \n"
"zip1 z2.b, z2.b, z2.b \n" "zip1 z2.b, z2.b, z2.b \n"
@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
uint64_t vl; uint64_t vl;
asm("mov z4.h, #0x0300 \n" asm volatile (
"mov z4.h, #0x0300 \n"
"ptrue p0.b \n" "ptrue p0.b \n"
"cnth %x[vl] \n" "cnth %x[vl] \n"
@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
// Output a row of UV values, filtering 2x2 rows of AYUV. // Output a row of UV values, filtering 2x2 rows of AYUV.
const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
int vl; int vl;
asm("cntb %x[vl] \n" asm volatile (
"cntb %x[vl] \n"
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
// Output a row of VU values, filtering 2x2 rows of AYUV. // Output a row of VU values, filtering 2x2 rows of AYUV.
const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv; const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
int vl; int vl;
asm("cntb %x[vl] \n" asm volatile (
"cntb %x[vl] \n"
"cmp %w[width], %w[vl] \n" "cmp %w[width], %w[vl] \n"
"subs %w[width], %w[width], %w[vl] \n" "subs %w[width], %w[width], %w[vl] \n"
"b.lt 2f \n" "b.lt 2f \n"
@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
uint32_t nv_v_start = 0x0003'0003U; uint32_t nv_v_start = 0x0003'0003U;
uint32_t nv_v_step = 0x0004'0004U; uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" asm volatile (
"ptrue p0.b \n"
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A
@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
uint32_t nv_v_start = 0x0002'0002U; uint32_t nv_v_start = 0x0002'0002U;
uint32_t nv_v_step = 0x0004'0004U; uint32_t nv_v_step = 0x0004'0004U;
uint64_t vl; uint64_t vl;
asm("cnth %0" : "=r"(vl)); asm volatile (
"cnth %0" : "=r"(vl));
int width_last_y = width & (vl - 1); int width_last_y = width & (vl - 1);
int width_last_uv = width_last_y + (width_last_y & 1); int width_last_uv = width_last_y + (width_last_y & 1);
asm("ptrue p0.b \n" asm volatile (
"ptrue p0.b \n"
"index z22.s, %w[nv_u_start], %w[nv_u_step] \n" "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
"index z23.s, %w[nv_v_start], %w[nv_v_step] \n" "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
"dup z19.b, #255 \n" // A "dup z19.b, #255 \n" // A

View File

@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1 "m"(kShuf1), // %1
"m"(kShuf2) // %2 "m"(kShuf2) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm2 \n"
@ -515,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1 "m"(kMadd11), // %1
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2 "m"(kShufAb2), // %2
"m"(kScaleAb2) // %3 "m"(kScaleAb2) // %3
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1 "m"(kShufAc3), // %1
"m"(kScaleAc33) // %2 "m"(kScaleAc33) // %2
); );
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0 "pxor %%xmm0,%%xmm0 \n" // 0
// above line // above line
@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm(LABELALIGN asm volatile (LABELALIGN
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"