Apply clang format

Bug: None
Change-Id: I0d9db4b384144523e61ae32b6ab3f72e93a0c265
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6138934
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
Frank Barchard 2025-01-02 13:20:17 -08:00
parent b5a18f9d93
commit e0040eb318
33 changed files with 1808 additions and 1804 deletions

View File

@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20;
static const int kCpuHasLSX = 0x100;
static const int kCpuHasLASX = 0x200;
// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
LIBYUV_API

View File

@ -3613,9 +3613,9 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int scale,
int width);
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
uint8_t* dst_y,
int scale,
int width);
void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,

View File

@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TORGB565_SVE_FROM_TOP_2X
"st2h {z18.h, z19.h}, p1, [%[dst]] \n"
"99: \n"
@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TOARGB1555_SVE_FROM_TOP_2X
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"
"99: \n"
@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TOARGB4444_SVE_FROM_TOP_2X
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"
"99: \n"

View File

@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint64_t diff;
asm volatile (
asm volatile(
"xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint32_t diff = 0u;
asm volatile (
asm volatile(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile (
asm volatile(
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"

View File

@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"

View File

@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"movi v16.16b, #0 \n"
"movi v17.16b, #0 \n"
"movi v18.16b, #0 \n"
@ -116,30 +116,30 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash = seed;
const uint32_t c16 = 0x92d9e201; // 33^16
uint32_t tmp, tmp2;
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
// count is always a multiple of 16.
// maintain two accumulators, reduce and then final sum in scalar since
// this has better performance on little cores.
"1: \n"
"ldr q0, [%[src]], #16 \n"
"subs %w[count], %w[count], #16 \n"
"tbl v3.16b, {v0.16b}, v19.16b \n"
"tbl v2.16b, {v0.16b}, v18.16b \n"
"tbl v1.16b, {v0.16b}, v17.16b \n"
"tbl v0.16b, {v0.16b}, v16.16b \n"
"mul v3.4s, v3.4s, v7.4s \n"
"mul v2.4s, v2.4s, v6.4s \n"
"mla v3.4s, v1.4s, v5.4s \n"
"mla v2.4s, v0.4s, v4.4s \n"
"addv s1, v3.4s \n"
"addv s0, v2.4s \n"
"fmov %w[tmp2], s1 \n"
"fmov %w[tmp], s0 \n"
"add %w[tmp], %w[tmp], %w[tmp2] \n"
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
"b.gt 1b \n"
"1: \n"
"ldr q0, [%[src]], #16 \n"
"subs %w[count], %w[count], #16 \n"
"tbl v3.16b, {v0.16b}, v19.16b \n"
"tbl v2.16b, {v0.16b}, v18.16b \n"
"tbl v1.16b, {v0.16b}, v17.16b \n"
"tbl v0.16b, {v0.16b}, v16.16b \n"
"mul v3.4s, v3.4s, v7.4s \n"
"mul v2.4s, v2.4s, v6.4s \n"
"mla v3.4s, v1.4s, v5.4s \n"
"mla v2.4s, v0.4s, v4.4s \n"
"addv s1, v3.4s \n"
"addv s0, v2.4s \n"
"fmov %w[tmp2], s1 \n"
"fmov %w[tmp], s0 \n"
"add %w[tmp], %w[tmp], %w[tmp2] \n"
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
"b.gt 1b \n"
: [hash] "+r"(hash), // %[hash]
[count] "+r"(count), // %[count]
[tmp] "=&r"(tmp), // %[tmp]
@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"movi v6.16b, #1 \n"
@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
int count) {
// count is guaranteed to be a multiple of 32.
uint32_t sse;
asm volatile (
asm volatile(
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"

View File

@ -665,7 +665,7 @@ int I010ToNV12(const uint16_t* src_y,
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
uint8_t* dst_uv, int width) = MergeUVRow_C;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
height == 0) {
return -1;

View File

@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample,
uint8_t* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
if (dst_argb == NULL || sample == NULL ||
src_width <= 0 || src_width > INT_MAX / 4 ||
crop_width <= 0 || crop_width > INT_MAX / 4 ||
if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
src_height == 0 || crop_height == 0) {
return -1;
}
@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample,
}
if (need_buf) {
const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height;
const uint64_t rotate_buffer_size =
(uint64_t)crop_width * 4 * abs_crop_height;
if (rotate_buffer_size > SIZE_MAX) {
return -1; // Invalid size.
}

View File

@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample,
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;
if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 ||
crop_width <= 0 || src_height == 0 || crop_height == 0) {
if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
crop_height == 0) {
return -1;
}
@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample,
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2;
const uint64_t rotate_buffer_size =
(uint64_t)y_size + (uint64_t)uv_size * 2;
if (rotate_buffer_size > SIZE_MAX) {
return -1; // Invalid size.
}

View File

@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb,
#endif
#if defined(HAS_COPYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
CopyRow =
IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
}
#endif
#if defined(HAS_COPYROW_ERMS)

View File

@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d

View File

@ -27,57 +27,57 @@ void TransposeWx8_NEON(const uint8_t* src,
int dst_stride,
int width) {
const uint8_t* temp;
asm volatile (
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %[width], #8 \n"
"sub %[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
"vld1.8 {d7}, [%[temp]] \n"
"add %[src], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
"vld1.8 {d7}, [%[temp]] \n"
"add %[src], #8 \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"subs %[width], #8 \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"subs %[width], #8 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %[temp], %[dst] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d6}, [%[temp]] \n"
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
"mov %[temp], %[dst] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
"vst1.8 {d6}, [%[temp]] \n"
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
"bge 1b \n"
"bge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst] "+r"(dst), // %[dst]
@ -95,72 +95,72 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b,
int width) {
const uint8_t* temp;
asm volatile (
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %[width], #8 \n"
"sub %[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
"vld2.8 {d22, d23}, [%[temp]] \n"
"add %[src], #8*2 \n"
"1: \n"
"mov %[temp], %[src] \n"
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
"vld2.8 {d22, d23}, [%[temp]] \n"
"add %[src], #8*2 \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"subs %[width], #8 \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"subs %[width], #8 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %[temp], %[dst_a] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d20}, [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_a] \n"
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
"vst1.8 {d20}, [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d21}, [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
"vst1.8 {d21}, [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"bge 1b \n"
"bge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst_a] "+r"(dst_a), // %[dst_a]
@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
uint8_t* dst1 = dst + dst_stride;
uint8_t* dst2 = dst1 + dst_stride;
uint8_t* dst3 = dst2 + dst_stride;
asm volatile (
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"

View File

@ -27,104 +27,104 @@ void TransposeWx16_NEON(const uint8_t* src,
int dst_stride,
int width) {
const uint8_t* src_temp;
asm volatile (
"1: \n"
"mov %[src_temp], %[src] \n"
asm volatile(
"1: \n"
"mov %[src_temp], %[src] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
"add %[src], %[src], #16 \n"
"add %[src], %[src], #16 \n"
// Transpose bytes within each 2x2 block.
"trn1 v0.16b, v16.16b, v17.16b \n"
"trn2 v1.16b, v16.16b, v17.16b \n"
"trn1 v2.16b, v18.16b, v19.16b \n"
"trn2 v3.16b, v18.16b, v19.16b \n"
"trn1 v4.16b, v20.16b, v21.16b \n"
"trn2 v5.16b, v20.16b, v21.16b \n"
"trn1 v6.16b, v22.16b, v23.16b \n"
"trn2 v7.16b, v22.16b, v23.16b \n"
"trn1 v8.16b, v24.16b, v25.16b \n"
"trn2 v9.16b, v24.16b, v25.16b \n"
"trn1 v10.16b, v26.16b, v27.16b \n"
"trn2 v11.16b, v26.16b, v27.16b \n"
"trn1 v12.16b, v28.16b, v29.16b \n"
"trn2 v13.16b, v28.16b, v29.16b \n"
"trn1 v14.16b, v30.16b, v31.16b \n"
"trn2 v15.16b, v30.16b, v31.16b \n"
"trn1 v0.16b, v16.16b, v17.16b \n"
"trn2 v1.16b, v16.16b, v17.16b \n"
"trn1 v2.16b, v18.16b, v19.16b \n"
"trn2 v3.16b, v18.16b, v19.16b \n"
"trn1 v4.16b, v20.16b, v21.16b \n"
"trn2 v5.16b, v20.16b, v21.16b \n"
"trn1 v6.16b, v22.16b, v23.16b \n"
"trn2 v7.16b, v22.16b, v23.16b \n"
"trn1 v8.16b, v24.16b, v25.16b \n"
"trn2 v9.16b, v24.16b, v25.16b \n"
"trn1 v10.16b, v26.16b, v27.16b \n"
"trn2 v11.16b, v26.16b, v27.16b \n"
"trn1 v12.16b, v28.16b, v29.16b \n"
"trn2 v13.16b, v28.16b, v29.16b \n"
"trn1 v14.16b, v30.16b, v31.16b \n"
"trn2 v15.16b, v30.16b, v31.16b \n"
// Transpose 2x2-byte blocks within each 4x4 block.
"trn1 v16.8h, v0.8h, v2.8h \n"
"trn1 v17.8h, v1.8h, v3.8h \n"
"trn2 v18.8h, v0.8h, v2.8h \n"
"trn2 v19.8h, v1.8h, v3.8h \n"
"trn1 v20.8h, v4.8h, v6.8h \n"
"trn1 v21.8h, v5.8h, v7.8h \n"
"trn2 v22.8h, v4.8h, v6.8h \n"
"trn2 v23.8h, v5.8h, v7.8h \n"
"trn1 v24.8h, v8.8h, v10.8h \n"
"trn1 v25.8h, v9.8h, v11.8h \n"
"trn2 v26.8h, v8.8h, v10.8h \n"
"trn2 v27.8h, v9.8h, v11.8h \n"
"trn1 v28.8h, v12.8h, v14.8h \n"
"trn1 v29.8h, v13.8h, v15.8h \n"
"trn2 v30.8h, v12.8h, v14.8h \n"
"trn2 v31.8h, v13.8h, v15.8h \n"
"trn1 v16.8h, v0.8h, v2.8h \n"
"trn1 v17.8h, v1.8h, v3.8h \n"
"trn2 v18.8h, v0.8h, v2.8h \n"
"trn2 v19.8h, v1.8h, v3.8h \n"
"trn1 v20.8h, v4.8h, v6.8h \n"
"trn1 v21.8h, v5.8h, v7.8h \n"
"trn2 v22.8h, v4.8h, v6.8h \n"
"trn2 v23.8h, v5.8h, v7.8h \n"
"trn1 v24.8h, v8.8h, v10.8h \n"
"trn1 v25.8h, v9.8h, v11.8h \n"
"trn2 v26.8h, v8.8h, v10.8h \n"
"trn2 v27.8h, v9.8h, v11.8h \n"
"trn1 v28.8h, v12.8h, v14.8h \n"
"trn1 v29.8h, v13.8h, v15.8h \n"
"trn2 v30.8h, v12.8h, v14.8h \n"
"trn2 v31.8h, v13.8h, v15.8h \n"
"subs %w[width], %w[width], #16 \n"
"subs %w[width], %w[width], #16 \n"
// Transpose 4x4-byte blocks within each 8x8 block.
"trn1 v0.4s, v16.4s, v20.4s \n"
"trn1 v2.4s, v17.4s, v21.4s \n"
"trn1 v4.4s, v18.4s, v22.4s \n"
"trn1 v6.4s, v19.4s, v23.4s \n"
"trn2 v8.4s, v16.4s, v20.4s \n"
"trn2 v10.4s, v17.4s, v21.4s \n"
"trn2 v12.4s, v18.4s, v22.4s \n"
"trn2 v14.4s, v19.4s, v23.4s \n"
"trn1 v1.4s, v24.4s, v28.4s \n"
"trn1 v3.4s, v25.4s, v29.4s \n"
"trn1 v5.4s, v26.4s, v30.4s \n"
"trn1 v7.4s, v27.4s, v31.4s \n"
"trn2 v9.4s, v24.4s, v28.4s \n"
"trn2 v11.4s, v25.4s, v29.4s \n"
"trn2 v13.4s, v26.4s, v30.4s \n"
"trn2 v15.4s, v27.4s, v31.4s \n"
"trn1 v0.4s, v16.4s, v20.4s \n"
"trn1 v2.4s, v17.4s, v21.4s \n"
"trn1 v4.4s, v18.4s, v22.4s \n"
"trn1 v6.4s, v19.4s, v23.4s \n"
"trn2 v8.4s, v16.4s, v20.4s \n"
"trn2 v10.4s, v17.4s, v21.4s \n"
"trn2 v12.4s, v18.4s, v22.4s \n"
"trn2 v14.4s, v19.4s, v23.4s \n"
"trn1 v1.4s, v24.4s, v28.4s \n"
"trn1 v3.4s, v25.4s, v29.4s \n"
"trn1 v5.4s, v26.4s, v30.4s \n"
"trn1 v7.4s, v27.4s, v31.4s \n"
"trn2 v9.4s, v24.4s, v28.4s \n"
"trn2 v11.4s, v25.4s, v29.4s \n"
"trn2 v13.4s, v26.4s, v30.4s \n"
"trn2 v15.4s, v27.4s, v31.4s \n"
// Transpose 8x8-byte blocks and store.
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
"b.gt 1b \n"
"b.gt 1b \n"
: [src] "+r"(src), // %[src]
[src_temp] "=&r"(src_temp), // %[src_temp]
[dst] "+r"(dst), // %[dst]
@ -145,76 +145,76 @@ void TransposeUVWx8_NEON(const uint8_t* src,
int dst_stride_b,
int width) {
const uint8_t* temp;
asm volatile (
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w[width], %w[width], #8 \n"
"sub %w[width], %w[width], #8 \n"
"1: \n"
"mov %[temp], %[src] \n"
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v7.16b}, [%[temp]] \n"
"add %[src], %[src], #16 \n"
"1: \n"
"mov %[temp], %[src] \n"
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
"ld1 {v7.16b}, [%[temp]] \n"
"add %[src], %[src], #16 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"subs %w[width], %w[width], #8 \n"
"subs %w[width], %w[width], #8 \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %[temp], %[dst_a] \n"
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[1], [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_a] \n"
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
"st1 {v19.d}[1], [%[temp]] \n"
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[1], [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"mov %[temp], %[dst_b] \n"
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
"st1 {v23.d}[1], [%[temp]] \n"
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
"b.ge 1b \n"
"b.ge 1b \n"
: [temp] "=&r"(temp), // %[temp]
[src] "+r"(src), // %[src]
[dst_a] "+r"(dst_a), // %[dst_a]
@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
uint8_t* dst1 = dst + dst_stride;
uint8_t* dst2 = dst1 + dst_stride;
uint8_t* dst3 = dst2 + dst_stride;
asm volatile (
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"

File diff suppressed because it is too large Load Diff

View File

@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
int width,
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
int width,
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants

View File

@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants

View File

@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV444 YUVTORGB
@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 YUVTORGB
RGBTORGB8
@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV444 YUVTORGB
RGBTORGB8
@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
uint8_t* dst_rgba,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV422 YUVTORGB
@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb1555,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"1: \n" READYUV422 YUVTORGB
RGBTORGB8
@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
uint8_t* dst_argb4444,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"vmov.u8 d7, #0x0f \n" // vbic bits to clear
@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUV400 YUVTORGB
@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
}
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
asm volatile (
asm volatile(
"vmov.u8 d23, #255 \n"
"1: \n"
"vld1.8 {d20}, [%0]! \n"
@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READNV21 YUVTORGB RGBTORGB8
@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb24,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READNV21 YUVTORGB RGBTORGB8
@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READNV12 YUVTORGB RGBTORGB8
@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READYUY2 YUVTORGB RGBTORGB8
@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d6, #255 \n"
"1: \n" READUYVY YUVTORGB RGBTORGB8
@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
"subs %2, %2, #16 \n" // 16 processed per loop
@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {d0, d1}, [%0], %4 \n"
"subs %3, %3, #16 \n"
@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
"pld [%0, #1792] \n"
@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
"vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
#endif
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q14}, [%0]! \n" // Load lower bits.
"vld1.8 {q9}, [%0]! \n" // Load upper bits row
@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load U
"vld1.8 {q1}, [%1]! \n" // load V
@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_rgb,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_b,
uint8_t* dst_a,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_a,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
"vld1.8 {q1}, [%1]! \n" // load G
@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
uint8_t* dst_g,
uint8_t* dst_b,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
const uint8_t* src_b,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 q3, #255 \n" // load A(255)
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load R
@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
int depth,
int width) {
int shift = 10 - depth;
asm volatile (
asm volatile(
"vmov.u32 q14, #1023 \n"
"vdup.32 q15, %5 \n"
"1: \n"
@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
uint8_t* dst_ar30,
int /* depth */,
int width) {
asm volatile (
asm volatile(
"vmov.u32 q14, #1023 \n"
"1: \n"
"vld1.16 {d4}, [%2]! \n" // B
@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile (
asm volatile(
"vdup.u16 q15, %6 \n"
"vdup.u16 q14, %7 \n"
@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
int width) {
int shift = 16 - depth;
int mask = (1 << depth) - 1;
asm volatile (
asm volatile(
"vmov.u8 q3, #0xff \n" // A (0xffff)
"vdup.u16 q15, %5 \n"
@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
int depth,
int width) {
int shift = 8 - depth;
asm volatile (
asm volatile(
"vdup.16 q15, %6 \n"
"1: \n"
@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
int depth,
int width) {
int shift = 8 - depth;
asm volatile (
asm volatile(
"vdup.16 q15, %5 \n"
"vmov.u8 d6, #0xff \n" // A (0xff)
@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
// SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
asm volatile (
asm volatile(
"vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
asm volatile (
asm volatile(
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #4 \n" // 4 pixels per loop
@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
}
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile (
asm volatile(
// Start at end of source row.
"add %0, %0, %2 \n"
"sub %0, %0, #32 \n" // 32 bytes per loop
@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
}
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile (
asm volatile(
// Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %2, lsl #1 \n"
@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
// Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %3, lsl #1 \n"
@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
}
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile (
asm volatile(
"add %0, %0, %2, lsl #2 \n"
"sub %0, #32 \n"
@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_rgb24,
int width) {
src_rgb24 += width * 3 - 24;
asm volatile (
asm volatile(
"1: \n"
"vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
"subs %2, #8 \n" // 8 pixels per loop.
@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
asm volatile (
asm volatile(
"vmov.u8 d4, #255 \n" // Alpha
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
}
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile (
asm volatile(
"vmov.u8 d0, #255 \n" // Alpha
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
);
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d3, #255 \n" // Alpha
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb24,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // stride + src_uyvy
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
const uint8_t* shuffler,
int width) {
asm volatile (
asm volatile(
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_yuy2,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us
@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
const uint8_t* src_v,
uint8_t* dst_uyvy,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us
@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb565,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb,
uint32_t dither4,
int width) {
asm volatile (
asm volatile(
"vdup.32 d7, %2 \n" // dither4
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb1555,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d7, #0x0f \n" // bits to clear with
// vbic.
"1: \n"
@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
uint8_t* dst_a,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON(
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile (
asm volatile(
"vld1.8 {d0}, [%4] \n" // load rgbuvconstants
"vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient
@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q2}, [%0]! \n"
@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64,
int width) {
asm volatile (
asm volatile(
"vld1.8 {q4}, [%3] \n" // shuffler
"1: \n"
@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vld1.8 {d8}, [%3] \n" // shuffler
"1: \n"
@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n"
@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n"
@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n"
@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int dst_width,
int source_y_fraction) {
int y1_fraction = source_y_fraction;
asm volatile (
asm volatile(
"cmp %4, #0 \n"
"beq 100f \n"
"add %2, %1 \n"
@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
asm volatile (
asm volatile(
"cmp %4, #0 \n"
"beq 100f \n"
"cmp %4, #128 \n"
@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"subs %3, #8 \n"
"blt 89f \n"
// Blend 8 pixels.
@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
// Attenuate 8 pixels.
@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_size,
int interval_offset,
int width) {
asm volatile (
asm volatile(
"vdup.u16 q8, %2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
"vdup.u16 q9, %3 \n" // interval multiply.
@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int width,
uint32_t value) {
asm volatile (
asm volatile(
"vdup.u32 q0, %3 \n" // duplicate scale value.
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
"vshr.u16 q0, q0, #1 \n" // scale / 2.
@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Similar to ARGBToYJ but stores ARGB.
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile (
asm volatile(
"vmov.u8 d20, #17 \n" // BB coefficient
"vmov.u8 d21, #68 \n" // BG coefficient
"vmov.u8 d22, #35 \n" // BR coefficient
@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
const int8_t* matrix_argb,
int width) {
asm volatile (
asm volatile(
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
// 8 pixel loop.
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
// 8 pixel loop.
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
// 8 pixel loop.
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_y,
int width) {
asm volatile (
asm volatile(
// 16 pixel loop.
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely,
uint8_t* dst_argb,
int width) {
asm volatile (
asm volatile(
"vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop.
"1: \n"
@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
const uint8_t* src_y2,
uint8_t* dst_sobelx,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n"
@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
const uint8_t* src_y1,
uint8_t* dst_sobely,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n"
@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
@ -3564,11 +3564,11 @@ void HalfFloatRow_NEON(const uint16_t* src,
"vqshrn.u32 d1, q9, #13 \n"
"vqshrn.u32 d2, q10, #13 \n"
"vqshrn.u32 d3, q11, #13 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float* dst,
float scale,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0,
const uint16_t* src4,
uint32_t* dst,
int width) {
asm volatile (
asm volatile(
"vmov.u16 d6, #4 \n" // constant 4
"vmov.u16 d7, #6 \n" // constant 6
@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src1 = src + 1;
const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3;
asm volatile (
asm volatile(
"vmov.u32 q10, #4 \n" // constant 4
"vmov.u32 q11, #6 \n" // constant 6
@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width) {
asm volatile (
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
// Copy row of AYUV Y's into Y.
// Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
"vld2.8 {d1, d3}, [%0]! \n"
@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
int width) {
const uint8_t* src_u_1 = src_u + src_stride_u;
const uint8_t* src_v_1 = src_v + src_stride_v;
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 U values
"vld1.8 {q1}, [%2]! \n" // load 16 V values
@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
int depth,
int width) {
int shift = depth - 16; // Negative for right shift.
asm volatile (
asm volatile(
"vdup.16 q2, %4 \n"
"1: \n"
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int depth,
int width) {
int shift = 16 - depth;
asm volatile (
asm volatile(
"vdup.16 q2, %4 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile (
asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
asm volatile (
asm volatile(
"vdup.16 d8, %3 \n"
"1: \n"
"vld1.16 {q2, q3}, [%0]! \n"
@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
int scale,
int width) {
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile (
asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"

File diff suppressed because it is too large Load Diff

View File

@ -47,7 +47,7 @@ extern "C" {
// register) is set to round-to-nearest-up mode(0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
{ \
asm volatile ("csrwi vxrm, 0"); \
asm volatile("csrwi vxrm, 0"); \
ub = yuvconst->kUVCoeff[0]; \
vr = yuvconst->kUVCoeff[1]; \
ug = yuvconst->kUVCoeff[2]; \
@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) sets to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
if (is_yb_positive) {
v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
} else {
@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
}
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
// Blend 50 / 50.
if (y1_fraction == 128) {
do {

View File

@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
const int16_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm volatile (
asm volatile(
"ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"

View File

@ -10,8 +10,8 @@
#include "libyuv/scale.h"
#include <limits.h>
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
if (!src_y || !src_u || !src_v || !dst_argb ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
dst_width <= 0 || dst_height <= 0 ||
clip_width <= 0 || clip_height <= 0) {
if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
return -1;
}
const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
clip_width, clip_height, filtering);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height,
dst_argb, dst_stride_argb, dst_width, dst_height, clip_x,
clip_y, clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}

View File

@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
// 16 pixel loop.
LABELALIGN
"1: \n"
@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
asm volatile (
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile (
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@ -1761,11 +1761,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
asm volatile("pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
// 16 pixel loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n" // src_ptr += 16
@ -1781,11 +1780,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#ifdef HAS_SCALEADDROW_AVX2
@ -1793,10 +1792,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile (
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32
@ -1811,11 +1809,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif // HAS_SCALEADDROW_AVX2
@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
asm volatile (
asm volatile(
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
asm volatile (
asm volatile(
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
asm volatile (
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
asm volatile (
asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
asm volatile (
asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n"
@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
asm volatile(
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
asm volatile (
asm volatile(
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
asm volatile (
asm volatile(
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"m"(kShuffleFractions) // %1
);
asm volatile (
asm volatile(
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
asm volatile (
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
asm volatile (
asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile (
asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8

View File

@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile (
asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n"
@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"vld1.8 {q3}, [%3] \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile (
asm volatile(
"vld1.16 {q13}, [%5] \n"
"vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n"
@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
asm volatile (
asm volatile(
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile (
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile (
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile (
asm volatile(
"vmov.u16 d31, #3 \n"
"1: \n"
@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"vmov.u16 d31, #3 \n"
"vmov.u32 q14, #3 \n"
@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile (
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile (
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile (
asm volatile(
"vmov.u16 d30, #3 \n"
"1: \n"
@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile (
asm volatile(
"vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n"
@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile (
asm volatile(
"1: \n"
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"mov r12, %3, lsl #2 \n"
"1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n"
@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
asm volatile (
asm volatile(
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
"1: \n"
@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
int dx) {
int tmp;
const uint8_t* src_tmp = src_argb;
asm volatile (
asm volatile(
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(d0, 0)
@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"vld1.16 {d0[0]}, [%0], %6 \n"
"vld1.16 {d0[1]}, [%1], %6 \n"

View File

@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@ -172,18 +172,18 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"ld1 {v29.16b}, [%[kShuf34_0]] \n"
"ld1 {v30.16b}, [%[kShuf34_1]] \n"
"ld1 {v31.16b}, [%[kShuf34_2]] \n"
"1: \n"
"ld1 {v29.16b}, [%[kShuf34_0]] \n"
"ld1 {v30.16b}, [%[kShuf34_1]] \n"
"ld1 {v31.16b}, [%[kShuf34_2]] \n"
"1: \n"
"ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
"subs %w[width], %w[width], #48 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n"
"tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n"
"tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n"
"st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
"b.gt 1b \n"
"subs %w[width], %w[width], #48 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n"
"tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n"
"tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n"
"st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
"b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(dst_width) // %[width]
@ -326,7 +326,7 @@ static const vec16 kMult38_Div664 = {
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,
65536 / 18, 65536 / 18, 65536 / 12,
0, 0};
0, 0};
// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
@ -335,26 +335,26 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"ld1 {v3.16b}, [%[kShuf38]] \n"
"subs %w[width], %w[width], #12 \n"
"b.eq 2f \n"
"ld1 {v3.16b}, [%[kShuf38]] \n"
"subs %w[width], %w[width], #12 \n"
"b.eq 2f \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]], #32 \n"
"subs %w[width], %w[width], #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
"str q2, [%[dst_ptr]] \n"
"add %[dst_ptr], %[dst_ptr], #12 \n"
"b.gt 1b \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]], #32 \n"
"subs %w[width], %w[width], #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
"str q2, [%[dst_ptr]] \n"
"add %[dst_ptr], %[dst_ptr], #12 \n"
"b.gt 1b \n"
// Store exactly 12 bytes on the final iteration to avoid writing past
// the end of the array.
"2: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
"st1 {v2.s}[2], [%[dst_ptr]] \n"
"2: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
"st1 {v2.s}[2], [%[dst_ptr]] \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(dst_width) // %[width]
@ -378,49 +378,49 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
asm volatile(
"ld1 {v27.16b}, [%[tblArray1]] \n"
"ld1 {v28.16b}, [%[tblArray2]] \n"
"ld1 {v29.16b}, [%[tblArray3]] \n"
"ld1 {v31.16b}, [%[tblArray4]] \n"
"ld1 {v30.16b}, [%[div996]] \n"
"ld1 {v27.16b}, [%[tblArray1]] \n"
"ld1 {v28.16b}, [%[tblArray2]] \n"
"ld1 {v29.16b}, [%[tblArray3]] \n"
"ld1 {v31.16b}, [%[tblArray4]] \n"
"ld1 {v30.16b}, [%[div996]] \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n"
"ldp q21, q1, [%[src_ptr1]], #32 \n"
"ldp q22, q2, [%[src_ptr2]], #32 \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n"
"ldp q21, q1, [%[src_ptr1]], #32 \n"
"ldp q22, q2, [%[src_ptr2]], #32 \n"
"subs %w[width], %w[width], #12 \n"
"subs %w[width], %w[width], #12 \n"
// Add across strided rows first.
"uaddl v23.8h, v20.8b, v21.8b \n"
"uaddl v3.8h, v0.8b, v1.8b \n"
"uaddl2 v24.8h, v20.16b, v21.16b \n"
"uaddl2 v4.8h, v0.16b, v1.16b \n"
"uaddl v23.8h, v20.8b, v21.8b \n"
"uaddl v3.8h, v0.8b, v1.8b \n"
"uaddl2 v24.8h, v20.16b, v21.16b \n"
"uaddl2 v4.8h, v0.16b, v1.16b \n"
"uaddw v23.8h, v23.8h, v22.8b \n"
"uaddw v3.8h, v3.8h, v2.8b \n"
"uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ...
"uaddw2 v4.8h, v4.8h, v2.16b \n"
"uaddw v23.8h, v23.8h, v22.8b \n"
"uaddw v3.8h, v3.8h, v2.8b \n"
"uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ...
"uaddw2 v4.8h, v4.8h, v2.16b \n"
// Permute groups of {three,three,two} into separate vectors to sum.
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ...
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ...
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0...
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ...
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ...
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0...
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
"add v23.8h, v20.8h, v21.8h \n"
"add v3.8h, v0.8h, v1.8h \n"
"add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h
"add v4.8h, v3.8h, v2.8h \n"
"add v23.8h, v20.8h, v21.8h \n"
"add v3.8h, v0.8h, v1.8h \n"
"add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h
"add v4.8h, v3.8h, v2.8h \n"
"sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6}
"sqrdmulh v25.8h, v4.8h, v30.8h \n"
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n"
"sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6}
"sqrdmulh v25.8h, v4.8h, v30.8h \n"
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
@ -446,41 +446,41 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
"ld1 {v28.16b}, [%[tblArray1]] \n"
"ld1 {v29.16b}, [%[tblArray2]] \n"
"ld1 {v31.16b}, [%[tblArray3]] \n"
"ld1 {v30.8h}, [%[div664]] \n"
"ld1 {v28.16b}, [%[tblArray1]] \n"
"ld1 {v29.16b}, [%[tblArray2]] \n"
"ld1 {v31.16b}, [%[tblArray3]] \n"
"ld1 {v30.8h}, [%[div664]] \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
"subs %w[width], %w[width], #12 \n"
"1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
"subs %w[width], %w[width], #12 \n"
// Permute into groups of six values (three pairs) to be summed.
"tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ...
"tbl v2.16b, {v0.16b}, v28.16b \n"
"tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ...
"tbl v3.16b, {v1.16b}, v28.16b \n"
"tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ...
"tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n"
"tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ...
"tbl v2.16b, {v0.16b}, v28.16b \n"
"tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ...
"tbl v3.16b, {v1.16b}, v28.16b \n"
"tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ...
"tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n"
"uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ...
"uaddlp v2.8h, v2.16b \n"
"uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ...
"uaddlp v3.8h, v3.16b \n"
"uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ...
"uaddlp v4.8h, v4.16b \n"
"add v20.8h, v22.8h, v23.8h \n"
"add v0.8h, v2.8h, v3.8h \n"
"add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ...
"add v1.8h, v0.8h, v4.8h \n"
"uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ...
"uaddlp v2.8h, v2.16b \n"
"uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ...
"uaddlp v3.8h, v3.16b \n"
"uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ...
"uaddlp v4.8h, v4.16b \n"
"add v20.8h, v22.8h, v23.8h \n"
"add v0.8h, v2.8h, v3.8h \n"
"add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ...
"add v1.8h, v0.8h, v4.8h \n"
"sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4}
"sqrdmulh v22.8h, v1.8h, v30.8h \n"
"tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n"
"sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4}
"sqrdmulh v22.8h, v1.8h, v30.8h \n"
"tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile (
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile (
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile (
asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile (
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile (
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile (
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile (
asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile (
asm volatile(
"1: \n"
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
@ -1043,14 +1043,14 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"trn1 v21.8h, v2.8h, v0.8h \n"
"1: \n" SCALE_FILTER_COLS_STEP_ADDR
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[7], [%[tmp_ptr]] \n"
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[7], [%[tmp_ptr]] \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
"trn1 v4.16b, v6.16b, v0.16b \n"
@ -1090,14 +1090,14 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"1: \n"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
"subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src], 448] \n"
"uzp2 v0.4s, v0.4s, v1.4s \n"
"uzp2 v1.4s, v2.4s, v3.4s \n"
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
"b.gt 1b \n"
"subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src], 448] \n"
"uzp2 v0.4s, v0.4s, v1.4s \n"
"uzp2 v1.4s, v2.4s, v3.4s \n"
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
"b.gt 1b \n"
: [src] "+r"(src_ptr), // %[src]
[dst] "+r"(dst), // %[dst]
[width] "+r"(dst_width) // %[width]
@ -1113,15 +1113,15 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1 = src_argb + 32;
asm volatile(
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]] \n"
"add %[src], %[src], #64 \n"
"ld2 {v2.4s, v3.4s}, [%[src1]] \n"
"add %[src1], %[src1], #64 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v1.16b, v2.16b, v3.16b \n"
"subs %w[width], %w[width], #8 \n"
"st1 {v0.16b, v1.16b}, [%[dst]], #32 \n"
"b.gt 1b \n"
"ld2 {v0.4s, v1.4s}, [%[src]] \n"
"add %[src], %[src], #64 \n"
"ld2 {v2.4s, v3.4s}, [%[src1]] \n"
"add %[src1], %[src1], #64 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v1.16b, v2.16b, v3.16b \n"
"subs %w[width], %w[width], #8 \n"
"st1 {v0.16b, v1.16b}, [%[dst]], #32 \n"
"b.gt 1b \n"
: [src] "+r"(src_argb), // %[src]
[src1] "+r"(src_argb1), // %[src1]
[dst] "+r"(dst_argb), // %[dst]
@ -1135,21 +1135,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile (
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
"uaddl v2.8h, v0.8b, v1.8b \n"
"uaddl2 v3.8h, v0.16b, v1.16b \n"
"uaddl v22.8h, v20.8b, v21.8b \n"
"uaddl2 v23.8h, v20.16b, v21.16b \n"
"add v0.8h, v2.8h, v22.8h \n"
"add v1.8h, v3.8h, v23.8h \n"
"rshrn v0.8b, v0.8h, #2 \n"
"rshrn v1.8b, v1.8h, #2 \n"
"subs %w[width], %w[width], #4 \n"
"stp d0, d1, [%[dst]], #16 \n"
"b.gt 1b \n"
asm volatile(
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
"uaddl v2.8h, v0.8b, v1.8b \n"
"uaddl2 v3.8h, v0.16b, v1.16b \n"
"uaddl v22.8h, v20.8b, v21.8b \n"
"uaddl2 v23.8h, v20.16b, v21.16b \n"
"add v0.8h, v2.8h, v22.8h \n"
"add v1.8h, v3.8h, v23.8h \n"
"rshrn v0.8b, v0.8h, #2 \n"
"rshrn v1.8b, v1.8h, #2 \n"
"subs %w[width], %w[width], #4 \n"
"stp d0, d1, [%[dst]], #16 \n"
"b.gt 1b \n"
: [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst),
[width] "+r"(dst_width)
:
@ -1166,26 +1166,22 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
const uint8_t* src_argb3 = src_argb + src_stepx * 12;
int64_t i = 0;
(void)src_stride;
asm volatile (
"1: \n"
"ldr w10, [%[src], %[i]] \n"
"ldr w11, [%[src1], %[i]] \n"
"ldr w12, [%[src2], %[i]] \n"
"ldr w13, [%[src3], %[i]] \n"
"add %[i], %[i], %[step] \n"
"subs %w[width], %w[width], #4 \n"
"prfm pldl1keep, [%[src], 448] \n"
"stp w10, w11, [%[dst]], #8 \n"
"stp w12, w13, [%[dst]], #8 \n"
"b.gt 1b \n"
: [src]"+r"(src_argb),
[src1]"+r"(src_argb1),
[src2]"+r"(src_argb2),
[src3]"+r"(src_argb3),
[dst]"+r"(dst_argb),
[width]"+r"(dst_width),
[i]"+r"(i)
: [step]"r"((int64_t)(src_stepx * 16))
asm volatile(
"1: \n"
"ldr w10, [%[src], %[i]] \n"
"ldr w11, [%[src1], %[i]] \n"
"ldr w12, [%[src2], %[i]] \n"
"ldr w13, [%[src3], %[i]] \n"
"add %[i], %[i], %[step] \n"
"subs %w[width], %w[width], #4 \n"
"prfm pldl1keep, [%[src], 448] \n"
"stp w10, w11, [%[dst]], #8 \n"
"stp w12, w13, [%[dst]], #8 \n"
"b.gt 1b \n"
: [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2),
[src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width),
[i] "+r"(i)
: [step] "r"((int64_t)(src_stepx * 16))
: "memory", "cc", "w10", "w11", "w12", "w13");
}
@ -1312,33 +1308,33 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"1: \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ldr d1, [%6] \n" //
"ldr d1, [%6] \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ldr d2, [%6] \n"
"shrn v4.4h, v5.4s, #9 \n" //
"ldr d2, [%6] \n"
"shrn v4.4h, v5.4s, #9 \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ld1 {v1.d}[1], [%6] \n" //
"ld1 {v1.d}[1], [%6] \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ld1 {v2.d}[1], [%6] \n"
"ld1 {v2.d}[1], [%6] \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"and v4.8b, v4.8b, v3.8b \n"
"trn1 v0.4s, v1.4s, v2.4s \n"
"tbl v4.16b, {v4.16b}, v18.16b \n" // f
"trn2 v1.4s, v1.4s, v2.4s \n"
"eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f
"subs %w2, %w2, #4 \n" // 4 processed per loop
"and v4.8b, v4.8b, v3.8b \n"
"trn1 v0.4s, v1.4s, v2.4s \n"
"tbl v4.16b, {v4.16b}, v18.16b \n" // f
"trn2 v1.4s, v1.4s, v2.4s \n"
"eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f
"umull v16.8h, v1.8b, v4.8b \n"
"umull2 v17.8h, v1.16b, v4.16b \n"
"umlal v16.8h, v0.8b, v7.8b \n"
"umlal2 v17.8h, v0.16b, v7.16b \n"
"umull v16.8h, v1.8b, v4.8b \n"
"umull2 v17.8h, v1.16b, v4.16b \n"
"umlal v16.8h, v0.8b, v7.8b \n"
"umlal2 v17.8h, v0.16b, v7.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"shrn v0.8b, v16.8h, #7 \n"
"shrn v1.8b, v17.8h, #7 \n"
"add v5.4s, v5.4s, v6.4s \n"
"stp d0, d1, [%0], #16 \n" // store pixels
"b.gt 1b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"shrn v0.8b, v16.8h, #7 \n"
"shrn v1.8b, v17.8h, #7 \n"
"add v5.4s, v5.4s, v6.4s \n"
"stp d0, d1, [%0], #16 \n" // store pixels
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@ -1360,34 +1356,34 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n"
"subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n"
"1: \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n"
"ldp q6, q7, [%[src_ptr], #96] \n"
"add %[src_ptr], %[src_ptr], #128 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n"
"ldp q6, q7, [%[src_ptr], #96] \n"
"add %[src_ptr], %[src_ptr], #128 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n"
"2: \n"
"adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n"
"adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]] \n"
"ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n"
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]] \n"
"99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
@ -1403,15 +1399,15 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
"1: \n"
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
"subs %w[dst_width], %w[dst_width], #16 \n"
"urhadd v0.8h, v0.8h, v1.8h \n"
"urhadd v1.8h, v2.8h, v3.8h \n"
"prfm pldl1keep, [%[src_ptr], 448] \n"
"stp q0, q1, [%[dst_ptr]], #32 \n"
"b.gt 1b \n"
"1: \n"
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
"subs %w[dst_width], %w[dst_width], #16 \n"
"urhadd v0.8h, v0.8h, v1.8h \n"
"urhadd v1.8h, v2.8h, v3.8h \n"
"prfm pldl1keep, [%[src_ptr], 448] \n"
"stp q0, q1, [%[dst_ptr]], #32 \n"
"b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst), // %[dst_ptr]
[dst_width] "+r"(dst_width) // %[dst_width]
@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
asm volatile (
asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
asm volatile (
asm volatile(
"1: \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"

View File

@ -10,8 +10,8 @@
#include "libyuv/scale.h" /* For FilterMode */
#include <limits.h>
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb,
int dst_height,
enum FilterMode filtering) {
int r;
if (!src_rgb || !dst_rgb ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) {
if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
dst_height <= 0) {
return -1;
}
const int abs_src_height = (src_height < 0) ? -src_height : src_height;

View File

@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
const uint32_t* src = (const uint32_t*)(src_argb);
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m4_t v_odd, v_even, v_dst;
vuint32m4_t v_odd_32, v_even_32;
@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
const int stride_byte = src_stepx * 4;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
(void)src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m4_t v_s0, v_s1, v_dst;
size_t vl = __riscv_vsetvl_e8m4(w);
@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
size_t vl = __riscv_vsetvl_e8m4(w);
vuint8m4_t v_s0, v_s1, v_t0, v_t1;
@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint8m2_t v_t0, v_t1, v_t2, v_t3;
@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
const uint8_t* t = src_ptr + src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
const uint8_t* t = src_ptr + src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
(void)src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m4_t v_u0v0, v_u1v1, v_avg;
vuint16m4_t v_u0v0_16, v_u1v1_16;
@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
do {
vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;

View File

@ -15,7 +15,6 @@ namespace libyuv {
extern "C" {
#endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__)

View File

@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width,
#endif
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV)
if (TestCpuFlag(kCpuHasRVV) && !filtering) {
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
#endif
#if defined(HAS_SCALEUVROWDOWN4_RVV)
if (col_step == 4) {
ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
}
#endif
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
#endif
#if defined(HAS_SCALEUVROWDOWN4_RVV)
if (col_step == 4) {
ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
}
#endif
}
#endif

View File

@ -12,6 +12,7 @@
#include <stdlib.h>
#include <time.h>
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@ -19,7 +20,6 @@
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "../unit_test/unit_test.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"

View File

@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
#endif
#ifdef __linux__
static void KernelVersion(int *version) {
static void KernelVersion(int* version) {
struct utsname buffer;
int i = 0;
version[0] = version[1] = 0;
if (uname(&buffer) == 0) {
char *v = buffer.release;
char* v = buffer.release;
for (i = 0; *v && i < 2; ++v) {
if (isdigit(*v)) {
version[i++] = (int) strtol(v, &v, 10);
version[i++] = (int)strtol(v, &v, 10);
}
}
}
@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
// Read and print the RVV vector length.
if (has_rvv) {
register uint32_t vlenb __asm__ ("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
register uint32_t vlenb __asm__("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
printf("RVV vector length: %d bytes\n", vlenb);
}
}
@ -161,7 +161,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
}
#endif // defined(__loongarch__)
#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
}
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
// defined(_M_X64)
}
TEST_F(LibYUVBaseTest, TestCompilerMacros) {

View File

@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width,
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width,
benchmark_width, benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert);
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width,
benchmark_width, benchmark_height * invert);
}
int max_diff = 0;
for (int i = 0; i < y_plane_size; ++i) {
int abs_diff = abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
int abs_diff =
abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
@ -1596,29 +1599,29 @@ static int TestCopyPlane(int benchmark_width,
TEST_F(LibYUVPlanarTest, CopyPlane_Any) {
int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) {
int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, CopyPlane_Invert) {
int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 0);
}
TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 0);
}
@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width,
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
scale, benchmark_width, benchmark_height * invert);
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_c),
benchmark_width * 2, scale, benchmark_width,
benchmark_height * invert);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info);
for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
scale, benchmark_width, benchmark_height * invert);
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_opt),
benchmark_width * 2, scale, benchmark_width,
benchmark_height * invert);
}
int max_diff = 0;
@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
EXPECT_EQ(0, diff);
}
@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff);
}
#if defined(__arm__)
static void EnableFlushDenormalToZero(void) {
uint32_t cw;
asm volatile (
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
asm volatile(
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)::"memory", "cc"); // Clobber List
}
static void DisableFlushDenormalToZero(void) {
uint32_t cw;
asm volatile (
"vmrs %0, fpscr \n"
"bic %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)
::"memory", "cc"); // Clobber List
asm volatile(
"vmrs %0, fpscr \n"
"bic %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw)::"memory", "cc"); // Clobber List
}
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) {
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C.
EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
int diff = TestHalfFloatPlane(
benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
DisableFlushDenormalToZero();
EXPECT_EQ(0, diff);
}
@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
tmp_pixels_c_b, benchmark_width_, benchmark_width_,
benchmark_height_);
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
benchmark_width_, tmp_pixels_c_b, benchmark_width_,
dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r,
@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
tmp_pixels_c_b, benchmark_width_, benchmark_width_,
benchmark_height_);
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
benchmark_width_, tmp_pixels_c_b, benchmark_width_,
dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL,
0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_);
}
@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r,
benchmark_width_, tmp_pixels_opt_g, benchmark_width_,
tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_,
benchmark_height_);
tmp_pixels_opt_b, benchmark_width_, NULL, 0,
benchmark_width_, benchmark_height_);
}
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,

View File

@ -320,16 +320,16 @@ TEST_FACTOR(3, 1, 3)
#ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(, name, width, height, None, 0) \
TEST_SCALETO1(, name, width, height, Linear, 3) \
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(, name, width, height, None, 0) \
TEST_SCALETO1(, name, width, height, Linear, 3) \
TEST_SCALETO1(, name, width, height, Bilinear, 3) \
TEST_SCALETO1(, name, width, height, Box, 3)
#else
#if defined(ENABLE_FULL_TESTS)
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
#define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
#else

View File

@ -1058,7 +1058,7 @@ TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
TEST_SCALETO(Scale, 1080, 1920) // for rotated phones
#endif // DISABLE_SLOW_TESTS
#endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO