Apply clang format

Bug: None
Change-Id: I0d9db4b384144523e61ae32b6ab3f72e93a0c265
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6138934
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard <fbarchard@chromium.org>, 2025-01-02 13:20:17 -08:00
commit e0040eb318, parent b5a18f9d93
33 changed files with 1808 additions and 1804 deletions


@@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20;
 static const int kCpuHasLSX = 0x100;
 static const int kCpuHasLASX = 0x200;
 // Optional init function. TestCpuFlag does an auto-init.
 // Returns cpu_info flags.
 LIBYUV_API


@@ -3613,9 +3613,9 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 int scale,
 int width);
 void Convert16To8Row_AVX512BW(const uint16_t* src_y,
 uint8_t* dst_y,
 int scale,
 int width);
 void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
 uint8_t* dst_ptr,
 int scale,
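
For context, these prototypes narrow 16-bit samples to 8 bits with a fixed-point scale (for example, scale = 16384 maps 10-bit input to 8-bit output). A rough C model of the behaviour, as a sketch only and not the exact Convert16To8Row_C source:

#include <stdint.h>

// Sketch: dst = clamp255((src * scale) >> 16) for each sample.
static void Convert16To8Row_Sketch(const uint16_t* src_y, uint8_t* dst_y,
                                   int scale, int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);
  }
}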


@@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TORGB565_SVE_FROM_TOP_2X
 "st2h {z18.h, z19.h}, p1, [%[dst]] \n"
 "99: \n"
@@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TOARGB1555_SVE_FROM_TOP_2X
 "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
 "99: \n"
@@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TOARGB4444_SVE_FROM_TOP_2X
 "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
 "99: \n"


@@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
 int count) {
 uint64_t diff;
-asm volatile (
+asm volatile(
 "xor %3,%3 \n"
 "xor %%r8,%%r8 \n"
 "xor %%r9,%%r9 \n"
@@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
 int count) {
 uint32_t diff = 0u;
-asm volatile (
+asm volatile(
 // Process 16 bytes per loop.
 LABELALIGN
 "1: \n"
@@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movdqa %4,%%xmm2 \n"
 "movdqa %5,%%xmm3 \n"
 "pxor %%xmm0,%%xmm0 \n"
@@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "vbroadcastf128 %4,%%ymm2 \n"
 "vbroadcastf128 %5,%%ymm3 \n"
 "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
@@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "pxor %%xmm0,%%xmm0 \n"
 "pxor %%xmm5,%%xmm5 \n"
@@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = {
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
 uint32_t hash;
-asm volatile (
+asm volatile(
 "movd %2,%%xmm0 \n"
 "pxor %%xmm7,%%xmm7 \n"
 "movdqa %4,%%xmm6 \n"


@@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "vmov.u16 q4, #0 \n" // accumulator
 "1: \n"
@@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "vmov.u8 q8, #0 \n"
 "vmov.u8 q10, #0 \n"
 "vmov.u8 q9, #0 \n"


@@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movi v4.8h, #0 \n"
 "1: \n"
@@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "movi v16.16b, #0 \n"
 "movi v17.16b, #0 \n"
 "movi v18.16b, #0 \n"
@@ -116,30 +116,30 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
 uint32_t hash = seed;
 const uint32_t c16 = 0x92d9e201; // 33^16
 uint32_t tmp, tmp2;
 asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"
 // count is always a multiple of 16.
 // maintain two accumulators, reduce and then final sum in scalar since
 // this has better performance on little cores.
 "1: \n"
 "ldr q0, [%[src]], #16 \n"
 "subs %w[count], %w[count], #16 \n"
 "tbl v3.16b, {v0.16b}, v19.16b \n"
 "tbl v2.16b, {v0.16b}, v18.16b \n"
 "tbl v1.16b, {v0.16b}, v17.16b \n"
 "tbl v0.16b, {v0.16b}, v16.16b \n"
 "mul v3.4s, v3.4s, v7.4s \n"
 "mul v2.4s, v2.4s, v6.4s \n"
 "mla v3.4s, v1.4s, v5.4s \n"
 "mla v2.4s, v0.4s, v4.4s \n"
 "addv s1, v3.4s \n"
 "addv s0, v2.4s \n"
 "fmov %w[tmp2], s1 \n"
 "fmov %w[tmp], s0 \n"
 "add %w[tmp], %w[tmp], %w[tmp2] \n"
 "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
 "b.gt 1b \n"
 : [hash] "+r"(hash), // %[hash]
 [count] "+r"(count), // %[count]
 [tmp] "=&r"(tmp), // %[tmp]
@@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movi v4.4s, #0 \n"
 "movi v5.4s, #0 \n"
 "movi v6.16b, #1 \n"
@@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
 int count) {
 // count is guaranteed to be a multiple of 32.
 uint32_t sse;
-asm volatile (
+asm volatile(
 "movi v4.4s, #0 \n"
 "movi v5.4s, #0 \n"


@@ -665,7 +665,7 @@ int I010ToNV12(const uint16_t* src_y,
 void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
 int width) = Convert16To8Row_C;
 void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
 uint8_t* dst_uv, int width) = MergeUVRow_C;
 if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
 height == 0) {
 return -1;


@@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample,
 uint8_t* rotate_buffer = NULL;
 int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-if (dst_argb == NULL || sample == NULL ||
-src_width <= 0 || src_width > INT_MAX / 4 ||
-crop_width <= 0 || crop_width > INT_MAX / 4 ||
+if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
+src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
 src_height == 0 || crop_height == 0) {
 return -1;
 }
@@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample,
 }
 if (need_buf) {
-const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height;
+const uint64_t rotate_buffer_size =
+(uint64_t)crop_width * 4 * abs_crop_height;
 if (rotate_buffer_size > SIZE_MAX) {
 return -1; // Invalid size.
 }
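
The re-wrapped lines above preserve the overflow-safe buffer sizing: the product is widened to 64 bits before multiplying, then rejected if it cannot fit in size_t. A standalone sketch of the same pattern (illustrative helper name, not libyuv API):

#include <stdint.h>
#include <stdlib.h>

// Returns NULL if crop_width * 4 * abs_crop_height would overflow size_t.
static uint8_t* AllocRotateBuffer(int crop_width, int abs_crop_height) {
  const uint64_t size = (uint64_t)crop_width * 4 * abs_crop_height;
  if (size > SIZE_MAX) {
    return NULL;  // too large for this platform's size_t (32-bit targets)
  }
  return (uint8_t*)malloc((size_t)size);
}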


@@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample,
 const int inv_crop_height =
 (src_height < 0) ? -abs_crop_height : abs_crop_height;
-if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 ||
-crop_width <= 0 || src_height == 0 || crop_height == 0) {
+if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
+crop_height == 0) {
 return -1;
 }
@@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample,
 if (need_buf) {
 int y_size = crop_width * abs_crop_height;
 int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2;
+const uint64_t rotate_buffer_size =
+(uint64_t)y_size + (uint64_t)uv_size * 2;
 if (rotate_buffer_size > SIZE_MAX) {
 return -1; // Invalid size.
 }


@@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb,
 #endif
 #if defined(HAS_COPYROW_AVX512BW)
 if (TestCpuFlag(kCpuHasAVX512BW)) {
-CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
+CopyRow =
+IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
 }
 #endif
 #if defined(HAS_COPYROW_ERMS)
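
The reflowed assignment is libyuv's usual runtime-dispatch pattern: use the full-width kernel when the row byte count is a multiple of the kernel's block size, otherwise the _Any_ wrapper that handles the remainder. A simplified sketch of that selection (the IS_ALIGNED definition shown is the common power-of-two test and is an assumption here; the function and typedef names are illustrative):

#include <stdint.h>

#define IS_ALIGNED(v, a) (!((uintptr_t)(v) & ((a)-1)))  /* assumed form */

typedef void (*CopyRowFn)(const uint8_t* src, uint8_t* dst, int width);

static CopyRowFn PickCopyRow(int width, int has_avx512bw, CopyRowFn copy_c,
                             CopyRowFn copy_any_avx512, CopyRowFn copy_avx512) {
  CopyRowFn fn = copy_c;
  if (has_avx512bw) {
    // Full-speed kernel only when width * 4 bytes is a multiple of 128.
    fn = IS_ALIGNED(width * 4, 128) ? copy_avx512 : copy_any_avx512;
  }
  return fn;
}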


@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
 uint8_t* dst_b,
 int dst_stride_b,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "movdqu (%0),%%xmm0 \n" // a b c d
@@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
 "1: \n"
 "vmovdqu (%0),%%xmm0 \n" // a b c d


@@ -27,57 +27,57 @@ void TransposeWx8_NEON(const uint8_t* src,
 int dst_stride,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
 "sub %[width], #8 \n"
 "1: \n"
 "mov %[temp], %[src] \n"
 "vld1.8 {d0}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d1}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d2}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d3}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d4}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d5}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d6}, [%[temp]], %[src_stride] \n"
 "vld1.8 {d7}, [%[temp]] \n"
 "add %[src], #8 \n"
 "vtrn.8 d1, d0 \n"
 "vtrn.8 d3, d2 \n"
 "vtrn.8 d5, d4 \n"
 "vtrn.8 d7, d6 \n"
 "subs %[width], #8 \n"
 "vtrn.16 d1, d3 \n"
 "vtrn.16 d0, d2 \n"
 "vtrn.16 d5, d7 \n"
 "vtrn.16 d4, d6 \n"
 "vtrn.32 d1, d5 \n"
 "vtrn.32 d0, d4 \n"
 "vtrn.32 d3, d7 \n"
 "vtrn.32 d2, d6 \n"
 "vrev16.8 q0, q0 \n"
 "vrev16.8 q1, q1 \n"
 "vrev16.8 q2, q2 \n"
 "vrev16.8 q3, q3 \n"
 "mov %[temp], %[dst] \n"
 "vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
 "vst1.8 {d6}, [%[temp]] \n"
 "add %[dst], %[dst], %[dst_stride], lsl #3 \n"
 "bge 1b \n"
 : [temp] "=&r"(temp), // %[temp]
 [src] "+r"(src), // %[src]
 [dst] "+r"(dst), // %[dst]
@@ -95,72 +95,72 @@ void TransposeUVWx8_NEON(const uint8_t* src,
 int dst_stride_b,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
 "sub %[width], #8 \n"
 "1: \n"
 "mov %[temp], %[src] \n"
 "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
 "vld2.8 {d22, d23}, [%[temp]] \n"
 "add %[src], #8*2 \n"
 "vtrn.8 q1, q0 \n"
 "vtrn.8 q3, q2 \n"
 "vtrn.8 q9, q8 \n"
 "vtrn.8 q11, q10 \n"
 "subs %[width], #8 \n"
 "vtrn.16 q1, q3 \n"
 "vtrn.16 q0, q2 \n"
 "vtrn.16 q9, q11 \n"
 "vtrn.16 q8, q10 \n"
 "vtrn.32 q1, q9 \n"
 "vtrn.32 q0, q8 \n"
 "vtrn.32 q3, q11 \n"
 "vtrn.32 q2, q10 \n"
 "vrev16.8 q0, q0 \n"
 "vrev16.8 q1, q1 \n"
 "vrev16.8 q2, q2 \n"
 "vrev16.8 q3, q3 \n"
 "vrev16.8 q8, q8 \n"
 "vrev16.8 q9, q9 \n"
 "vrev16.8 q10, q10 \n"
 "vrev16.8 q11, q11 \n"
 "mov %[temp], %[dst_a] \n"
 "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
 "vst1.8 {d20}, [%[temp]] \n"
 "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
 "mov %[temp], %[dst_b] \n"
 "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
 "vst1.8 {d21}, [%[temp]] \n"
 "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
 "bge 1b \n"
 : [temp] "=&r"(temp), // %[temp]
 [src] "+r"(src), // %[src]
 [dst_a] "+r"(dst_a), // %[dst_a]
@@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
 uint8_t* dst1 = dst + dst_stride;
 uint8_t* dst2 = dst1 + dst_stride;
 uint8_t* dst3 = dst2 + dst_stride;
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"


@@ -27,104 +27,104 @@ void TransposeWx16_NEON(const uint8_t* src,
 int dst_stride,
 int width) {
 const uint8_t* src_temp;
-asm volatile (
+asm volatile(
 "1: \n"
 "mov %[src_temp], %[src] \n"
 "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
 "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
 "add %[src], %[src], #16 \n"
 // Transpose bytes within each 2x2 block.
 "trn1 v0.16b, v16.16b, v17.16b \n"
 "trn2 v1.16b, v16.16b, v17.16b \n"
 "trn1 v2.16b, v18.16b, v19.16b \n"
 "trn2 v3.16b, v18.16b, v19.16b \n"
 "trn1 v4.16b, v20.16b, v21.16b \n"
 "trn2 v5.16b, v20.16b, v21.16b \n"
 "trn1 v6.16b, v22.16b, v23.16b \n"
 "trn2 v7.16b, v22.16b, v23.16b \n"
 "trn1 v8.16b, v24.16b, v25.16b \n"
 "trn2 v9.16b, v24.16b, v25.16b \n"
 "trn1 v10.16b, v26.16b, v27.16b \n"
 "trn2 v11.16b, v26.16b, v27.16b \n"
 "trn1 v12.16b, v28.16b, v29.16b \n"
 "trn2 v13.16b, v28.16b, v29.16b \n"
 "trn1 v14.16b, v30.16b, v31.16b \n"
 "trn2 v15.16b, v30.16b, v31.16b \n"
 // Transpose 2x2-byte blocks within each 4x4 block.
 "trn1 v16.8h, v0.8h, v2.8h \n"
 "trn1 v17.8h, v1.8h, v3.8h \n"
 "trn2 v18.8h, v0.8h, v2.8h \n"
 "trn2 v19.8h, v1.8h, v3.8h \n"
 "trn1 v20.8h, v4.8h, v6.8h \n"
 "trn1 v21.8h, v5.8h, v7.8h \n"
 "trn2 v22.8h, v4.8h, v6.8h \n"
 "trn2 v23.8h, v5.8h, v7.8h \n"
 "trn1 v24.8h, v8.8h, v10.8h \n"
 "trn1 v25.8h, v9.8h, v11.8h \n"
 "trn2 v26.8h, v8.8h, v10.8h \n"
 "trn2 v27.8h, v9.8h, v11.8h \n"
 "trn1 v28.8h, v12.8h, v14.8h \n"
 "trn1 v29.8h, v13.8h, v15.8h \n"
 "trn2 v30.8h, v12.8h, v14.8h \n"
 "trn2 v31.8h, v13.8h, v15.8h \n"
 "subs %w[width], %w[width], #16 \n"
 // Transpose 4x4-byte blocks within each 8x8 block.
 "trn1 v0.4s, v16.4s, v20.4s \n"
 "trn1 v2.4s, v17.4s, v21.4s \n"
 "trn1 v4.4s, v18.4s, v22.4s \n"
 "trn1 v6.4s, v19.4s, v23.4s \n"
 "trn2 v8.4s, v16.4s, v20.4s \n"
 "trn2 v10.4s, v17.4s, v21.4s \n"
 "trn2 v12.4s, v18.4s, v22.4s \n"
 "trn2 v14.4s, v19.4s, v23.4s \n"
 "trn1 v1.4s, v24.4s, v28.4s \n"
 "trn1 v3.4s, v25.4s, v29.4s \n"
 "trn1 v5.4s, v26.4s, v30.4s \n"
 "trn1 v7.4s, v27.4s, v31.4s \n"
 "trn2 v9.4s, v24.4s, v28.4s \n"
 "trn2 v11.4s, v25.4s, v29.4s \n"
 "trn2 v13.4s, v26.4s, v30.4s \n"
 "trn2 v15.4s, v27.4s, v31.4s \n"
 // Transpose 8x8-byte blocks and store.
 "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
 "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
 "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
 "b.gt 1b \n"
 : [src] "+r"(src), // %[src]
 [src_temp] "=&r"(src_temp), // %[src_temp]
 [dst] "+r"(dst), // %[dst]
@@ -145,76 +145,76 @@ void TransposeUVWx8_NEON(const uint8_t* src,
 int dst_stride_b,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
 "sub %w[width], %w[width], #8 \n"
 "1: \n"
 "mov %[temp], %[src] \n"
 "ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
 "ld1 {v7.16b}, [%[temp]] \n"
 "add %[src], %[src], #16 \n"
 "trn1 v16.16b, v0.16b, v1.16b \n"
 "trn2 v17.16b, v0.16b, v1.16b \n"
 "trn1 v18.16b, v2.16b, v3.16b \n"
 "trn2 v19.16b, v2.16b, v3.16b \n"
 "trn1 v20.16b, v4.16b, v5.16b \n"
 "trn2 v21.16b, v4.16b, v5.16b \n"
 "trn1 v22.16b, v6.16b, v7.16b \n"
 "trn2 v23.16b, v6.16b, v7.16b \n"
 "subs %w[width], %w[width], #8 \n"
 "trn1 v0.8h, v16.8h, v18.8h \n"
 "trn2 v1.8h, v16.8h, v18.8h \n"
 "trn1 v2.8h, v20.8h, v22.8h \n"
 "trn2 v3.8h, v20.8h, v22.8h \n"
 "trn1 v4.8h, v17.8h, v19.8h \n"
 "trn2 v5.8h, v17.8h, v19.8h \n"
 "trn1 v6.8h, v21.8h, v23.8h \n"
 "trn2 v7.8h, v21.8h, v23.8h \n"
 "trn1 v16.4s, v0.4s, v2.4s \n"
 "trn2 v17.4s, v0.4s, v2.4s \n"
 "trn1 v18.4s, v1.4s, v3.4s \n"
 "trn2 v19.4s, v1.4s, v3.4s \n"
 "trn1 v20.4s, v4.4s, v6.4s \n"
 "trn2 v21.4s, v4.4s, v6.4s \n"
 "trn1 v22.4s, v5.4s, v7.4s \n"
 "trn2 v23.4s, v5.4s, v7.4s \n"
 "mov %[temp], %[dst_a] \n"
 "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
 "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
 "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
 "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
 "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
 "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
 "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
 "st1 {v19.d}[1], [%[temp]] \n"
 "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
 "mov %[temp], %[dst_b] \n"
 "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
 "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
 "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
 "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
 "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
 "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
 "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
 "st1 {v23.d}[1], [%[temp]] \n"
 "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
 "b.ge 1b \n"
 : [temp] "=&r"(temp), // %[temp]
 [src] "+r"(src), // %[src]
 [dst_a] "+r"(dst_a), // %[dst_a]
@@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
 uint8_t* dst1 = dst + dst_stride;
 uint8_t* dst2 = dst1 + dst_stride;
 uint8_t* dst3 = dst2 + dst_stride;
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"

(File diff suppressed because it is too large.)


@@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
 int width,
 const struct RgbConstants* rgbconstants) {
 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
 int width,
 const struct RgbConstants* rgbconstants) {
 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants


@@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants


@@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV444 YUVTORGB
@@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV444 YUVTORGB
 RGBTORGB8
@@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV444 YUVTORGB
 RGBTORGB8
@@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV422 YUVTORGB
 RGBTORGB8
@@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
 uint8_t* dst_rgba,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
 uint8_t* dst_argb1555,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV422 YUVTORGB
 RGBTORGB8
@@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
 uint8_t* dst_argb4444,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "vmov.u8 d7, #0x0f \n" // vbic bits to clear
@@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV400 YUVTORGB
@@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 }
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d23, #255 \n"
 "1: \n"
 "vld1.8 {d20}, [%0]! \n"
@@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV21 YUVTORGB RGBTORGB8
@@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV21 YUVTORGB RGBTORGB8
@@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUY2 YUVTORGB RGBTORGB8
@@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READUYVY YUVTORGB RGBTORGB8
@@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
 "subs %3, %3, #16 \n" // 16 processed per loop
@@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
 ptrdiff_t src_tile_stride,
 uint8_t* dst,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
 ptrdiff_t src_tile_stride,
 uint16_t* dst,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {d0, d1}, [%0], %4 \n"
 "subs %3, %3, #16 \n"
@@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 ptrdiff_t src_uv_tile_stride,
 uint8_t* dst_yuy2,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
 "pld [%0, #1792] \n"
@@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 ptrdiff_t src_uv_tile_stride,
 uint8_t* dst_yuy2,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
 "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
@@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 #endif
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q14}, [%0]! \n" // Load lower bits.
 "vld1.8 {q9}, [%0]! \n" // Load upper bits row
@@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
 const uint8_t* src_v,
 uint8_t* dst_uv,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load U
 "vld1.8 {q1}, [%1]! \n" // load V
@@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
 uint8_t* dst_g,
 uint8_t* dst_b,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
 "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
@@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_b,
 uint8_t* dst_rgb,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load R
 "vld1.8 {q1}, [%1]! \n" // load G
@@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_b,
 uint8_t* dst_a,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_a,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q2}, [%0]! \n" // load R
 "vld1.8 {q1}, [%1]! \n" // load G
@@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_g,
 uint8_t* dst_b,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_b,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 q3, #255 \n" // load A(255)
 "1: \n"
 "vld1.8 {q2}, [%0]! \n" // load R
@@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 10 - depth;
-asm volatile (
+asm volatile(
 "vmov.u32 q14, #1023 \n"
 "vdup.32 q15, %5 \n"
 "1: \n"
@@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
 uint8_t* dst_ar30,
 int /* depth */,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u32 q14, #1023 \n"
 "1: \n"
 "vld1.16 {d4}, [%2]! \n" // B
@@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
 int width) {
 int shift = 16 - depth;
 int mask = (1 << depth) - 1;
-asm volatile (
+asm volatile(
 "vdup.u16 q15, %6 \n"
 "vdup.u16 q14, %7 \n"
@@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
 int width) {
 int shift = 16 - depth;
 int mask = (1 << depth) - 1;
-asm volatile (
+asm volatile(
 "vmov.u8 q3, #0xff \n" // A (0xffff)
 "vdup.u16 q15, %5 \n"
@@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 8 - depth;
-asm volatile (
+asm volatile(
 "vdup.16 q15, %6 \n"
 "1: \n"
@@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 8 - depth;
-asm volatile (
+asm volatile(
 "vdup.16 q15, %5 \n"
 "vmov.u8 d6, #0xff \n" // A (0xff)
@@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
 "subs %2, %2, #32 \n" // 32 processed per loop
@@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 // SetRow writes 'width' bytes using an 8 bit value repeated.
 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
-asm volatile (
+asm volatile(
 "vdup.8 q0, %2 \n" // duplicate 16 bytes
 "1: \n"
 "subs %1, %1, #16 \n" // 16 bytes per loop
@@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
 // ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
-asm volatile (
+asm volatile(
 "vdup.u32 q0, %2 \n" // duplicate 4 ints
 "1: \n"
 "subs %1, %1, #4 \n" // 4 pixels per loop
@@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
 }
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "add %0, %0, %2 \n"
 "sub %0, %0, #32 \n" // 32 bytes per loop
@@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 }
 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "mov r12, #-16 \n"
 "add %0, %0, %2, lsl #1 \n"
@@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "mov r12, #-16 \n"
 "add %0, %0, %3, lsl #1 \n"
@@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 }
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "add %0, %0, %2, lsl #2 \n"
 "sub %0, #32 \n"
@@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
 uint8_t* dst_rgb24,
 int width) {
 src_rgb24 += width * 3 - 24;
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
 "subs %2, #8 \n" // 8 pixels per loop.
@@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d4, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
@@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 }
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d4, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 }
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d0, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
 );
 }
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 uint8_t* dst_rgb24,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
@@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 }
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
} }
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
} }
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop. "subs %2, %2, #16 \n" // 16 processed per loop.
@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2 "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // stride + src_uyvy "add %1, %0, %1 \n" // stride + src_uyvy
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2, int stride_yuy2,
uint8_t* dst_uv, uint8_t* dst_uv,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // stride + src_yuy2 "add %1, %0, %1 \n" // stride + src_yuy2
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
const uint8_t* shuffler, const uint8_t* shuffler,
int width) { int width) {
asm volatile ( asm volatile(
"vld1.8 {q2}, [%3] \n" // shuffler "vld1.8 {q2}, [%3] \n" // shuffler
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 4 pixels. "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_yuy2, uint8_t* dst_yuy2,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
"vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d1}, [%1]! \n" // load 8 Us
@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
const uint8_t* src_v, const uint8_t* src_v,
uint8_t* dst_uyvy, uint8_t* dst_uyvy,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
"vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d0}, [%1]! \n" // load 8 Us
@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
void ARGBToRGB565Row_NEON(const uint8_t* src_argb, void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb565, uint8_t* dst_rgb565,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
uint8_t* dst_rgb, uint8_t* dst_rgb,
uint32_t dither4, uint32_t dither4,
int width) { int width) {
asm volatile ( asm volatile(
"vdup.32 d7, %2 \n" // dither4 "vdup.32 d7, %2 \n" // dither4
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb1555, uint8_t* dst_argb1555,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_argb4444, uint8_t* dst_argb4444,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u8 d7, #0x0f \n" // bits to clear with "vmov.u8 d7, #0x0f \n" // bits to clear with
// vbic. // vbic.
"1: \n" "1: \n"
@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
uint8_t* dst_a, uint8_t* dst_a,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON(
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct RgbUVConstants* rgbuvconstants) {
asm volatile ( asm volatile(
"vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vld1.8 {d0}, [%4] \n" // load rgbuvconstants
"vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // src_stride + src_argb "add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient // coefficient
@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // src_stride + src_argb "add %1, %0, %1 \n" // src_stride + src_argb
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
// coefficient // coefficient
@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
} }
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void ARGBToAR64Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ar64, uint16_t* dst_ar64,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" "vld1.8 {q0}, [%0]! \n"
"vld1.8 {q2}, [%0]! \n" "vld1.8 {q2}, [%0]! \n"
@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
void ARGBToAB64Row_NEON(const uint8_t* src_argb, void ARGBToAB64Row_NEON(const uint8_t* src_argb,
uint16_t* dst_ab64, uint16_t* dst_ab64,
int width) { int width) {
asm volatile ( asm volatile(
"vld1.8 {q4}, [%3] \n" // shuffler "vld1.8 {q4}, [%3] \n" // shuffler
"1: \n" "1: \n"
@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
void AR64ToARGBRow_NEON(const uint16_t* src_ar64, void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.16 {q0}, [%0]! \n" "vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n" "vld1.16 {q1}, [%0]! \n"
@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
void AB64ToARGBRow_NEON(const uint16_t* src_ab64, void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"vld1.8 {d8}, [%3] \n" // shuffler "vld1.8 {d8}, [%3] \n" // shuffler
"1: \n" "1: \n"
@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile ( asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants "vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n" "vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n" "vdup.u8 d21, d0[1] \n"
@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile ( asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants "vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n" "vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n" "vdup.u8 d21, d0[1] \n"
@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct RgbConstants* rgbconstants) { const struct RgbConstants* rgbconstants) {
asm volatile ( asm volatile(
"vld1.8 {d0}, [%3] \n" // load rgbconstants "vld1.8 {d0}, [%3] \n" // load rgbconstants
"vdup.u8 d20, d0[0] \n" "vdup.u8 d20, d0[0] \n"
"vdup.u8 d21, d0[1] \n" "vdup.u8 d21, d0[1] \n"
@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
int dst_width, int dst_width,
int source_y_fraction) { int source_y_fraction) {
int y1_fraction = source_y_fraction; int y1_fraction = source_y_fraction;
asm volatile ( asm volatile(
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "beq 100f \n"
"add %2, %1 \n" "add %2, %1 \n"
@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
int y0_fraction = 256 - y1_fraction; int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride; const uint16_t* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile(
"cmp %4, #0 \n" "cmp %4, #0 \n"
"beq 100f \n" "beq 100f \n"
"cmp %4, #128 \n" "cmp %4, #128 \n"
@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1, const uint8_t* src_argb1,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"subs %3, #8 \n" "subs %3, #8 \n"
"blt 89f \n" "blt 89f \n"
// Blend 8 pixels. // Blend 8 pixels.
@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
void ARGBAttenuateRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up "vmov.u16 q15, #0x00ff \n" // 255 for rounding up
// Attenuate 8 pixels. // Attenuate 8 pixels.
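A scalar sketch of the attenuation being vectorized here, assuming the (channel * alpha + 255) >> 8 convention suggested by the "255 for rounding up" constant loaded into q15; the helper name is illustrative. Alpha itself is passed through unchanged.

#include <stdint.h>

// Illustrative per-pixel premultiply of B, G, R by alpha.
static void ARGBAttenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                    int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src[3];
    dst[0] = (uint8_t)((src[0] * a + 255) >> 8);  // B
    dst[1] = (uint8_t)((src[1] * a + 255) >> 8);  // G
    dst[2] = (uint8_t)((src[2] * a + 255) >> 8);  // R
    dst[3] = (uint8_t)a;                          // A unchanged
    src += 4;
    dst += 4;
  }
}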
@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
int interval_size, int interval_size,
int interval_offset, int interval_offset,
int width) { int width) {
asm volatile ( asm volatile(
"vdup.u16 q8, %2 \n" "vdup.u16 q8, %2 \n"
"vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vshr.u16 q8, q8, #1 \n" // scale >>= 1
"vdup.u16 q9, %3 \n" // interval multiply. "vdup.u16 q9, %3 \n" // interval multiply.
@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int width, int width,
uint32_t value) { uint32_t value) {
asm volatile ( asm volatile(
"vdup.u32 q0, %3 \n" // duplicate scale value. "vdup.u32 q0, %3 \n" // duplicate scale value.
"vzip.u8 d0, d1 \n" // d0 aarrggbb. "vzip.u8 d0, d1 \n" // d0 aarrggbb.
"vshr.u16 q0, q0, #1 \n" // scale / 2. "vshr.u16 q0, q0, #1 \n" // scale / 2.
@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Similar to ARGBToYJ but stores ARGB. // Similar to ARGBToYJ but stores ARGB.
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
// g = (r * 45 + g * 88 + b * 22) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
asm volatile ( asm volatile(
"vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d20, #17 \n" // BB coefficient
"vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d21, #68 \n" // BG coefficient
"vmov.u8 d22, #35 \n" // BR coefficient "vmov.u8 d22, #35 \n" // BR coefficient
@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
const int8_t* matrix_argb, const int8_t* matrix_argb,
int width) { int width) {
asm volatile ( asm volatile(
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1, const uint8_t* src_argb1,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1, const uint8_t* src_argb1,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1, const uint8_t* src_argb1,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely, const uint8_t* src_sobely,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely, const uint8_t* src_sobely,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile ( asm volatile(
// 16 pixel loop. // 16 pixel loop.
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
const uint8_t* src_sobely, const uint8_t* src_sobely,
uint8_t* dst_argb, uint8_t* dst_argb,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u8 d3, #255 \n" // alpha "vmov.u8 d3, #255 \n" // alpha
// 8 pixel loop. // 8 pixel loop.
"1: \n" "1: \n"
@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
const uint8_t* src_y2, const uint8_t* src_y2,
uint8_t* dst_sobelx, uint8_t* dst_sobelx,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d0}, [%0],%5 \n" // top
"vld1.8 {d1}, [%0],%6 \n" "vld1.8 {d1}, [%0],%6 \n"
@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
const uint8_t* src_y1, const uint8_t* src_y1,
uint8_t* dst_sobely, uint8_t* dst_sobely,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d0}, [%0],%4 \n" // left
"vld1.8 {d1}, [%1],%4 \n" "vld1.8 {d1}, [%1],%4 \n"
@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst, uint16_t* dst,
float scale, float scale,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
@ -3564,11 +3564,11 @@ void HalfFloatRow_NEON(const uint16_t* src,
"vqshrn.u32 d1, q9, #13 \n" "vqshrn.u32 d1, q9, #13 \n"
"vqshrn.u32 d2, q10, #13 \n" "vqshrn.u32 d2, q10, #13 \n"
"vqshrn.u32 d3, q11, #13 \n" "vqshrn.u32 d3, q11, #13 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16 "vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3 : "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
} }
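A scalar sketch of the fp16 packing trick used above: 1.9259299444e-34f is 2^-112, which rebiases the float exponent into half-precision range, so the float's bit pattern shifted right by 13 is the IEEE-754 half encoding (exponent bias 127 vs 15 differs by 112, mantissa width 23 vs 10 differs by 13). The sketch truncates and omits the saturation that vqshrn.u32 #13 provides for out-of-range values; the helper name is illustrative.

#include <stdint.h>
#include <string.h>

// Illustrative scalar form of one element of HalfFloatRow.
static inline uint16_t HalfFromUint16_Sketch(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));   // reinterpret the float encoding
  return (uint16_t)(bits >> 13);     // top bits = half-precision encoding
}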
@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float* dst, float* dst,
float scale, float scale,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes "vld1.8 {d2}, [%0]! \n" // load 8 bytes
@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0,
const uint16_t* src4, const uint16_t* src4,
uint32_t* dst, uint32_t* dst,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d6, #4 \n" // constant 4
"vmov.u16 d7, #6 \n" // constant 6 "vmov.u16 d7, #6 \n" // constant 6
@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src1 = src + 1; const uint32_t* src1 = src + 1;
const uint32_t* src2 = src + 2; const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3; const uint32_t* src3 = src + 3;
asm volatile ( asm volatile(
"vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q10, #4 \n" // constant 4
"vmov.u32 q11, #6 \n" // constant 6 "vmov.u32 q11, #6 \n" // constant 6
@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu, const uint8_t* src_vu,
uint8_t* dst_yuv24, uint8_t* dst_yuv24,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld1.8 {q2}, [%0]! \n" // load 16 Y values
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv, int src_stride_ayuv,
uint8_t* dst_uv, uint8_t* dst_uv,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv, int src_stride_ayuv,
uint8_t* dst_vu, uint8_t* dst_vu,
int width) { int width) {
asm volatile ( asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV "add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
// Copy row of AYUV Y's into Y. // Copy row of AYUV Y's into Y.
// Similar to ARGBExtractAlphaRow_NEON // Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// Convert UV plane of NV12 to VU of NV21. // Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
"vld2.8 {d1, d3}, [%0]! \n" "vld2.8 {d1, d3}, [%0]! \n"
@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
int width) { int width) {
const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_u_1 = src_u + src_stride_u;
const uint8_t* src_v_1 = src_v + src_stride_v; const uint8_t* src_v_1 = src_v + src_stride_v;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q0}, [%0]! \n" // load 16 U values
"vld1.8 {q1}, [%2]! \n" // load 16 V values "vld1.8 {q1}, [%2]! \n" // load 16 V values
@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
int depth, int depth,
int width) { int width) {
int shift = depth - 16; // Negative for right shift. int shift = depth - 16; // Negative for right shift.
asm volatile ( asm volatile(
"vdup.16 q2, %4 \n" "vdup.16 q2, %4 \n"
"1: \n" "1: \n"
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int depth, int depth,
int width) { int width) {
int shift = 16 - depth; int shift = 16 - depth;
asm volatile ( asm volatile(
"vdup.16 q2, %4 \n" "vdup.16 q2, %4 \n"
"1: \n" "1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q0}, [%0]! \n" // load 8 U
@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y, uint16_t* dst_y,
int scale, int scale,
int width) { int width) {
asm volatile ( asm volatile(
"vdup.16 q2, %3 \n" "vdup.16 q2, %3 \n"
"1: \n" "1: \n"
"vld1.16 {q0}, [%0]! \n" "vld1.16 {q0}, [%0]! \n"
@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y, uint16_t* dst_y,
int scale, int scale,
int width) { int width) {
asm volatile ( asm volatile(
"vdup.16 d8, %3 \n" "vdup.16 d8, %3 \n"
"1: \n" "1: \n"
"vld1.16 {q2, q3}, [%0]! \n" "vld1.16 {q2, q3}, [%0]! \n"
@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
int scale, int scale,
int width) { int width) {
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile ( asm volatile(
"vdup.16 q2, %3 \n" "vdup.16 q2, %3 \n"
"1: \n" "1: \n"
"vld1.16 {q0}, [%0]! \n" "vld1.16 {q0}, [%0]! \n"

File diff suppressed because it is too large



@ -47,7 +47,7 @@ extern "C" {
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
{ \ { \
asm volatile ("csrwi vxrm, 0"); \ asm volatile("csrwi vxrm, 0"); \
ub = yuvconst->kUVCoeff[0]; \ ub = yuvconst->kUVCoeff[0]; \
vr = yuvconst->kUVCoeff[1]; \ vr = yuvconst->kUVCoeff[1]; \
ug = yuvconst->kUVCoeff[2]; \ ug = yuvconst->kUVCoeff[2]; \
@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
// To match behavior on other platforms, vxrm (fixed-point rounding mode // To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) sets to round-to-nearest-up mode(0). // register) sets to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
if (is_yb_positive) { if (is_yb_positive) {
v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
} else { } else {
@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
} }
// To match behavior on other platforms, vxrm (fixed-point rounding mode // To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up(0). // register) is set to round-to-nearest-up(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
// Blend 50 / 50. // Blend 50 / 50.
if (y1_fraction == 128) { if (y1_fraction == 128) {
do { do {


@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
const int16_t* uvconstants) { const int16_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb; const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl; uint64_t vl;
asm volatile ( asm volatile(
"ptrue p0.b \n" "ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"


@ -10,8 +10,8 @@
#include "libyuv/scale.h" #include "libyuv/scale.h"
#include <limits.h>
#include <assert.h> #include <assert.h>
#include <limits.h>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
(void)src_fourcc; // TODO(fbarchard): implement and/or assert. (void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc; (void)dst_fourcc;
const int abs_src_height = (src_height < 0) ? -src_height : src_height; const int abs_src_height = (src_height < 0) ? -src_height : src_height;
if (!src_y || !src_u || !src_v || !dst_argb || if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
dst_width <= 0 || dst_height <= 0 || dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
clip_width <= 0 || clip_height <= 0) {
return -1; return -1;
} }
const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4; const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height); argb_buffer, src_width * 4, src_width, src_height);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb, r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y, dst_argb, dst_stride_argb, dst_width, dst_height, clip_x,
clip_width, clip_height, filtering); clip_y, clip_width, clip_height, filtering);
free(argb_buffer); free(argb_buffer);
return r; return r;
} }
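The path above converts I420 into a temporary ARGB buffer and then clips and scales it into the destination. A hypothetical helper below illustrates why argb_buffer_size is computed in uint64_t before the allocation: the width * height * 4 product is validated before being narrowed to size_t. The helper name and the exact overflow check are assumptions for illustration, not the library's code (which is not fully shown in this hunk).

#include <stdint.h>
#include <stdlib.h>

// Hypothetical: allocate width * abs_height * 4 bytes only if the 64-bit
// product actually fits in size_t on this platform.
static void* AllocArgbBuffer_Sketch(int src_width, int abs_src_height) {
  uint64_t size = (uint64_t)src_width * (uint64_t)abs_src_height * 4;
  if (size == 0 || (uint64_t)(size_t)size != size) {
    return NULL;  // Empty, or does not fit in size_t.
  }
  return malloc((size_t)size);
}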


@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
// 16 pixel loop. // 16 pixel loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n"
@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n"
@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x20(%0),%%ymm1 \n"
@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n" "psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n" "pslld $0x10,%%xmm5 \n"
@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
intptr_t stridex3; intptr_t stridex3;
asm volatile ( asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n" "movdqa %%xmm4,%%xmm5 \n"
@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n"
@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n"
@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"movdqa %0,%%xmm3 \n" "movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n" "movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n" "movdqa %2,%%xmm5 \n"
@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1 "m"(kShuf1), // %1
"m"(kShuf2) // %2 "m"(kShuf2) // %2
); );
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm2 \n"
@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21 "movdqa %2,%%xmm4 \n" // kShuf21
@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1 "m"(kShuf11), // %1
"m"(kShuf21) // %2 "m"(kShuf21) // %2
); );
asm volatile ( asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34 "movdqa %2,%%xmm1 \n" // kRound34
@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1 "m"(kMadd11), // %1
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21 "movdqa %2,%%xmm4 \n" // kShuf21
@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1 "m"(kShuf11), // %1
"m"(kShuf21) // %2 "m"(kShuf21) // %2
); );
asm volatile ( asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34 "movdqa %2,%%xmm1 \n" // kRound34
@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2 "m"(kRound34) // %2
); );
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm6 \n" "movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"movdqa %0,%%xmm2 \n" "movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n" "movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n" "movdqa %2,%%xmm4 \n"
@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2 "m"(kShufAb2), // %2
"m"(kScaleAb2) // %3 "m"(kScaleAb2) // %3
); );
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"movdqa %0,%%xmm2 \n" "movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n" "movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n" "movdqa %2,%%xmm4 \n"
@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1 "m"(kShufAc3), // %1
"m"(kScaleAc33) // %2 "m"(kScaleAc33) // %2
); );
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0 "pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n" "pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n" "psrlw $15,%%xmm6 \n"
@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0 "pxor %%xmm0,%%xmm0 \n" // 0
// above line // above line
@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"movdqa %3,%%xmm5 \n" "movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n" "pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n" "psrlw $15,%%xmm4 \n"
@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqw %%xmm7,%%xmm7 \n" "pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n" "psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8 "psllw $3,%%xmm7 \n" // all 8
@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n" "pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n" "psrld $31,%%xmm4 \n"
@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n" "pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n" "psrld $31,%%xmm6 \n"
@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n" "pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n" "psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2 "psllw $1,%%xmm4 \n" // all 2
@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n" "pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n" "psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8 "psllw $3,%%xmm6 \n" // all 8
@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vbroadcastf128 %3,%%ymm5 \n" "vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n"
@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n"
@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@ -1761,11 +1761,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile ( asm volatile("pxor %%xmm5,%%xmm5 \n"
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop. // 16 pixel loop.
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"movdqu (%0),%%xmm3 \n" "movdqu (%0),%%xmm3 \n"
"lea 0x10(%0),%0 \n" // src_ptr += 16 "lea 0x10(%0),%0 \n" // src_ptr += 16
@ -1781,11 +1780,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n" "sub $0x10,%2 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(src_width) // %2 "+r"(src_width) // %2
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
} }
#ifdef HAS_SCALEADDROW_AVX2 #ifdef HAS_SCALEADDROW_AVX2
@ -1793,10 +1792,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile ( asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
"vmovdqu (%0),%%ymm3 \n" "vmovdqu (%0),%%ymm3 \n"
"lea 0x20(%0),%0 \n" // src_ptr += 32 "lea 0x20(%0),%0 \n" // src_ptr += 32
@ -1811,11 +1809,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
: "+r"(src_ptr), // %0 : "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1 "+r"(dst_ptr), // %1
"+r"(src_width) // %2 "+r"(src_width) // %2
: :
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
} }
#endif // HAS_SCALEADDROW_AVX2 #endif // HAS_SCALEADDROW_AVX2
@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x, int x,
int dx) { int dx) {
intptr_t x0, x1, temp_pixel; intptr_t x0, x1, temp_pixel;
asm volatile ( asm volatile(
"movd %6,%%xmm2 \n" "movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n" "movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n" "movl $0x04040000,%k2 \n"
@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12; intptr_t src_stepx_x12;
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"lea 0x00(,%1,4),%1 \n" "lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%1,%1,2),%4 \n"
@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12; intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride); intptr_t row1 = (intptr_t)(src_stride);
asm volatile ( asm volatile(
"lea 0x00(,%1,4),%1 \n" "lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n" "lea 0x00(%0,%5,1),%5 \n"
@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x, int x,
int dx) { int dx) {
intptr_t x0, x1; intptr_t x0, x1;
asm volatile ( asm volatile(
"movd %5,%%xmm2 \n" "movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n" "movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n"
@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) { int dx) {
(void)x; (void)x;
(void)dx; (void)dx;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"movdqu (%1),%%xmm0 \n" "movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x, int x,
int dx) { int dx) {
intptr_t x0, x1; intptr_t x0, x1;
asm volatile ( asm volatile(
"movdqa %0,%%xmm4 \n" "movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n" "movdqa %1,%%xmm5 \n"
: :
@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"m"(kShuffleFractions) // %1 "m"(kShuffleFractions) // %1
); );
asm volatile ( asm volatile(
"movd %5,%%xmm2 \n" "movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n" "movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n" "pcmpeqb %%xmm6,%%xmm6 \n"
@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result. // Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) { int FixedDiv_X86(int num, int div) {
asm volatile ( asm volatile(
"cdq \n" "cdq \n"
"shld $0x10,%%eax,%%edx \n" "shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n" "shl $0x10,%%eax \n"
@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result. // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) { int FixedDiv1_X86(int num, int div) {
asm volatile ( asm volatile(
"cdq \n" "cdq \n"
"shld $0x10,%%eax,%%edx \n" "shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n" "shl $0x10,%%eax \n"
@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n"
@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n" "pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n" "psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2 "psllw $1,%%xmm4 \n" // all 2
@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n" "pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n" "psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8 "psllw $3,%%xmm6 \n" // all 8
@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pxor %%xmm5,%%xmm5 \n" "pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n" "pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n" "psrld $31,%%xmm4 \n"
@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"pxor %%xmm7,%%xmm7 \n" "pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n" "pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n" "psrld $31,%%xmm6 \n"
@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2 "vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
ptrdiff_t dst_stride, ptrdiff_t dst_stride,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8 "vpslld $3,%%ymm6,%%ymm6 \n" // all 8


@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// load even pixels into q0, odd into q1 // load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n" "vld2.8 {q0, q1}, [%0]! \n"
@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0 \n" "add %1, %0 \n"
"1: \n" "1: \n"
@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop "subs %2, %2, #8 \n" // 8 processed per loop
@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
const uint8_t* src_ptr3 = src_ptr + src_stride * 3; const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n" "vld1.8 {q1}, [%3]! \n"
@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n" "subs %2, %2, #24 \n"
@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vmov.u8 d24, #3 \n" "vmov.u8 d24, #3 \n"
"add %3, %0 \n" "add %3, %0 \n"
"1: \n" "1: \n"
@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"vld1.8 {q3}, [%3] \n" "vld1.8 {q3}, [%3] \n"
"1: \n" "1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
int dst_width) { int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride * 2; const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
asm volatile ( asm volatile(
"vld1.16 {q13}, [%5] \n" "vld1.16 {q13}, [%5] \n"
"vld1.8 {q14}, [%6] \n" "vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n" "vld1.8 {q15}, [%7] \n"
@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"vld1.16 {q13}, [%4] \n" "vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n" "vld1.8 {q14}, [%5] \n"
"add %3, %0 \n" "add %3, %0 \n"
@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp = src_ptr + 1;
asm volatile ( asm volatile(
"vmov.u8 d30, #3 \n" "vmov.u8 d30, #3 \n"
"1: \n" "1: \n"
@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1; const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"vmov.u16 q15, #3 \n" "vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n" "vmov.u8 d28, #3 \n"
@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
asm volatile ( asm volatile(
"vmov.u16 q15, #3 \n" "vmov.u16 q15, #3 \n"
"1: \n" "1: \n"
@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1; const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"vmov.u16 q15, #3 \n" "vmov.u16 q15, #3 \n"
"1: \n" "1: \n"
@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
asm volatile ( asm volatile(
"vmov.u16 d31, #3 \n" "vmov.u16 d31, #3 \n"
"1: \n" "1: \n"
@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1; const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"vmov.u16 d31, #3 \n" "vmov.u16 d31, #3 \n"
"vmov.u32 q14, #3 \n" "vmov.u32 q14, #3 \n"
@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp = src_ptr + 2;
asm volatile ( asm volatile(
"vmov.u8 d30, #3 \n" "vmov.u8 d30, #3 \n"
"1: \n" "1: \n"
@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2; const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile ( asm volatile(
"vmov.u16 q15, #3 \n" "vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n" "vmov.u8 d28, #3 \n"
@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp = src_ptr + 2;
asm volatile ( asm volatile(
"vmov.u16 d30, #3 \n" "vmov.u16 d30, #3 \n"
"1: \n" "1: \n"
@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2; const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile ( asm volatile(
"vmov.u16 d30, #3 \n" "vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n" "vmov.u32 q14, #3 \n"
@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.16 {q1, q2}, [%1] \n" // load accumulator
"vld1.8 {q0}, [%0]! \n" // load 16 bytes "vld1.8 {q0}, [%0]! \n" // load 16 bytes
@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"mov r12, %3, lsl #2 \n" "mov r12, %3, lsl #2 \n"
"1: \n" "1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[0]}, [%0], r12 \n"
@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
int src_stepx, int src_stepx,
uint8_t* dst_argb, uint8_t* dst_argb,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
"mov r12, %4, lsl #2 \n" "mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
int dx) { int dx) {
int tmp; int tmp;
const uint8_t* src_tmp = src_argb; const uint8_t* src_tmp = src_argb;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// clang-format off // clang-format off
LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 0)
@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6; const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[0]}, [%0], %6 \n"
"vld1.16 {d0[1]}, [%1], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n"


@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
// load even pixels into v0, odd into v1 // load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
@ -172,18 +172,18 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"ld1 {v29.16b}, [%[kShuf34_0]] \n" "ld1 {v29.16b}, [%[kShuf34_0]] \n"
"ld1 {v30.16b}, [%[kShuf34_1]] \n" "ld1 {v30.16b}, [%[kShuf34_1]] \n"
"ld1 {v31.16b}, [%[kShuf34_2]] \n" "ld1 {v31.16b}, [%[kShuf34_2]] \n"
"1: \n" "1: \n"
"ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n" "ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
"subs %w[width], %w[width], #48 \n" "subs %w[width], %w[width], #48 \n"
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n" "prfm pldl1keep, [%[src_ptr], 448] \n"
"tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n"
"tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n"
"st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(dst_width) // %[width] [width] "+r"(dst_width) // %[width]
@ -326,7 +326,7 @@ static const vec16 kMult38_Div664 = {
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0}; 65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12, static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,
65536 / 18, 65536 / 18, 65536 / 12, 65536 / 18, 65536 / 18, 65536 / 12,
0, 0}; 0, 0};
// 32 -> 12 // 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr, void ScaleRowDown38_NEON(const uint8_t* src_ptr,
@ -335,26 +335,26 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"ld1 {v3.16b}, [%[kShuf38]] \n" "ld1 {v3.16b}, [%[kShuf38]] \n"
"subs %w[width], %w[width], #12 \n" "subs %w[width], %w[width], #12 \n"
"b.eq 2f \n" "b.eq 2f \n"
"1: \n" "1: \n"
"ldp q0, q1, [%[src_ptr]], #32 \n" "ldp q0, q1, [%[src_ptr]], #32 \n"
"subs %w[width], %w[width], #12 \n" "subs %w[width], %w[width], #12 \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
"str q2, [%[dst_ptr]] \n" "str q2, [%[dst_ptr]] \n"
"add %[dst_ptr], %[dst_ptr], #12 \n" "add %[dst_ptr], %[dst_ptr], #12 \n"
"b.gt 1b \n" "b.gt 1b \n"
// Store exactly 12 bytes on the final iteration to avoid writing past // Store exactly 12 bytes on the final iteration to avoid writing past
// the end of the array. // the end of the array.
"2: \n" "2: \n"
"ldp q0, q1, [%[src_ptr]] \n" "ldp q0, q1, [%[src_ptr]] \n"
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%[dst_ptr]], #8 \n" "st1 {v2.8b}, [%[dst_ptr]], #8 \n"
"st1 {v2.s}[2], [%[dst_ptr]] \n" "st1 {v2.s}[2], [%[dst_ptr]] \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[width] "+r"(dst_width) // %[width] [width] "+r"(dst_width) // %[width]
@ -378,49 +378,49 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
asm volatile( asm volatile(
"ld1 {v27.16b}, [%[tblArray1]] \n" "ld1 {v27.16b}, [%[tblArray1]] \n"
"ld1 {v28.16b}, [%[tblArray2]] \n" "ld1 {v28.16b}, [%[tblArray2]] \n"
"ld1 {v29.16b}, [%[tblArray3]] \n" "ld1 {v29.16b}, [%[tblArray3]] \n"
"ld1 {v31.16b}, [%[tblArray4]] \n" "ld1 {v31.16b}, [%[tblArray4]] \n"
"ld1 {v30.16b}, [%[div996]] \n" "ld1 {v30.16b}, [%[div996]] \n"
"1: \n" "1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n" "ldp q20, q0, [%[src_ptr]], #32 \n"
"ldp q21, q1, [%[src_ptr1]], #32 \n" "ldp q21, q1, [%[src_ptr1]], #32 \n"
"ldp q22, q2, [%[src_ptr2]], #32 \n" "ldp q22, q2, [%[src_ptr2]], #32 \n"
"subs %w[width], %w[width], #12 \n" "subs %w[width], %w[width], #12 \n"
// Add across strided rows first. // Add across strided rows first.
"uaddl v23.8h, v20.8b, v21.8b \n" "uaddl v23.8h, v20.8b, v21.8b \n"
"uaddl v3.8h, v0.8b, v1.8b \n" "uaddl v3.8h, v0.8b, v1.8b \n"
"uaddl2 v24.8h, v20.16b, v21.16b \n" "uaddl2 v24.8h, v20.16b, v21.16b \n"
"uaddl2 v4.8h, v0.16b, v1.16b \n" "uaddl2 v4.8h, v0.16b, v1.16b \n"
"uaddw v23.8h, v23.8h, v22.8b \n" "uaddw v23.8h, v23.8h, v22.8b \n"
"uaddw v3.8h, v3.8h, v2.8b \n" "uaddw v3.8h, v3.8h, v2.8b \n"
"uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ... "uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ...
"uaddw2 v4.8h, v4.8h, v2.16b \n" "uaddw2 v4.8h, v4.8h, v2.16b \n"
// Permute groups of {three,three,two} into separate vectors to sum. // Permute groups of {three,three,two} into separate vectors to sum.
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ... "tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ...
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n" "tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ... "tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ...
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n" "tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0... "tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0...
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n" "tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
"add v23.8h, v20.8h, v21.8h \n" "add v23.8h, v20.8h, v21.8h \n"
"add v3.8h, v0.8h, v1.8h \n" "add v3.8h, v0.8h, v1.8h \n"
"add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h "add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h
"add v4.8h, v3.8h, v2.8h \n" "add v4.8h, v3.8h, v2.8h \n"
"sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6} "sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6}
"sqrdmulh v25.8h, v4.8h, v30.8h \n" "sqrdmulh v25.8h, v4.8h, v30.8h \n"
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow. "tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n" "st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n" "st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
@ -446,41 +446,41 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
int dst_width) { int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile( asm volatile(
"ld1 {v28.16b}, [%[tblArray1]] \n" "ld1 {v28.16b}, [%[tblArray1]] \n"
"ld1 {v29.16b}, [%[tblArray2]] \n" "ld1 {v29.16b}, [%[tblArray2]] \n"
"ld1 {v31.16b}, [%[tblArray3]] \n" "ld1 {v31.16b}, [%[tblArray3]] \n"
"ld1 {v30.8h}, [%[div664]] \n" "ld1 {v30.8h}, [%[div664]] \n"
"1: \n" "1: \n"
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
"subs %w[width], %w[width], #12 \n" "subs %w[width], %w[width], #12 \n"
// Permute into groups of six values (three pairs) to be summed. // Permute into groups of six values (three pairs) to be summed.
"tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ... "tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ...
"tbl v2.16b, {v0.16b}, v28.16b \n" "tbl v2.16b, {v0.16b}, v28.16b \n"
"tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ... "tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ...
"tbl v3.16b, {v1.16b}, v28.16b \n" "tbl v3.16b, {v1.16b}, v28.16b \n"
"tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ... "tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ...
"tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n" "tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n"
"uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ... "uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ...
"uaddlp v2.8h, v2.16b \n" "uaddlp v2.8h, v2.16b \n"
"uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ... "uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ...
"uaddlp v3.8h, v3.16b \n" "uaddlp v3.8h, v3.16b \n"
"uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ... "uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ...
"uaddlp v4.8h, v4.16b \n" "uaddlp v4.8h, v4.16b \n"
"add v20.8h, v22.8h, v23.8h \n" "add v20.8h, v22.8h, v23.8h \n"
"add v0.8h, v2.8h, v3.8h \n" "add v0.8h, v2.8h, v3.8h \n"
"add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ... "add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ...
"add v1.8h, v0.8h, v4.8h \n" "add v1.8h, v0.8h, v4.8h \n"
"sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4} "sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4}
"sqrdmulh v22.8h, v1.8h, v30.8h \n" "sqrdmulh v22.8h, v1.8h, v30.8h \n"
"tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow. "tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow.
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n" "st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n" "st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1; const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"movi v31.8b, #3 \n" "movi v31.8b, #3 \n"
"movi v30.8h, #3 \n" "movi v30.8h, #3 \n"
@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
asm volatile ( asm volatile(
"movi v31.8h, #3 \n" "movi v31.8h, #3 \n"
"1: \n" "1: \n"
@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1; const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"movi v31.8h, #3 \n" "movi v31.8h, #3 \n"
"1: \n" "1: \n"
@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
asm volatile ( asm volatile(
"movi v31.8h, #3 \n" "movi v31.8h, #3 \n"
"1: \n" "1: \n"
@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1; const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile ( asm volatile(
"movi v31.4h, #3 \n" "movi v31.4h, #3 \n"
"movi v30.4s, #3 \n" "movi v30.4s, #3 \n"
@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr, uint8_t* dst_ptr,
int dst_width) { int dst_width) {
const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp = src_ptr + 2;
asm volatile ( asm volatile(
"movi v31.8b, #3 \n" "movi v31.8b, #3 \n"
"1: \n" "1: \n"
@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2; const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile ( asm volatile(
"movi v31.8b, #3 \n" "movi v31.8b, #3 \n"
"movi v30.8h, #3 \n" "movi v30.8h, #3 \n"
@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int dst_width) { int dst_width) {
const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp = src_ptr + 2;
asm volatile ( asm volatile(
"movi v31.8h, #3 \n" "movi v31.8h, #3 \n"
"1: \n" "1: \n"
@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2; const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile ( asm volatile(
"movi v31.4h, #3 \n" "movi v31.4h, #3 \n"
"movi v30.4s, #3 \n" "movi v30.4s, #3 \n"
@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr, uint16_t* dst_ptr,
int src_width) { int src_width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
@ -1043,14 +1043,14 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"trn1 v21.8h, v2.8h, v0.8h \n" "trn1 v21.8h, v2.8h, v0.8h \n"
"1: \n" SCALE_FILTER_COLS_STEP_ADDR "1: \n" SCALE_FILTER_COLS_STEP_ADDR
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR "ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
"ld1 {v6.h}[7], [%[tmp_ptr]] \n" "ld1 {v6.h}[7], [%[tmp_ptr]] \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop "subs %w[width], %w[width], #8 \n" // 8 processed per loop
"trn1 v4.16b, v6.16b, v0.16b \n" "trn1 v4.16b, v6.16b, v0.16b \n"
@ -1090,14 +1090,14 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
"prfm pldl1keep, [%[src], 448] \n" "prfm pldl1keep, [%[src], 448] \n"
"uzp2 v0.4s, v0.4s, v1.4s \n" "uzp2 v0.4s, v0.4s, v1.4s \n"
"uzp2 v1.4s, v2.4s, v3.4s \n" "uzp2 v1.4s, v2.4s, v3.4s \n"
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src] "+r"(src_ptr), // %[src] : [src] "+r"(src_ptr), // %[src]
[dst] "+r"(dst), // %[dst] [dst] "+r"(dst), // %[dst]
[width] "+r"(dst_width) // %[width] [width] "+r"(dst_width) // %[width]
@ -1113,15 +1113,15 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1 = src_argb + 32; const uint8_t* src_argb1 = src_argb + 32;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]] \n" "ld2 {v0.4s, v1.4s}, [%[src]] \n"
"add %[src], %[src], #64 \n" "add %[src], %[src], #64 \n"
"ld2 {v2.4s, v3.4s}, [%[src1]] \n" "ld2 {v2.4s, v3.4s}, [%[src1]] \n"
"add %[src1], %[src1], #64 \n" "add %[src1], %[src1], #64 \n"
"urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v1.16b, v2.16b, v3.16b \n" "urhadd v1.16b, v2.16b, v3.16b \n"
"subs %w[width], %w[width], #8 \n" "subs %w[width], %w[width], #8 \n"
"st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src] "+r"(src_argb), // %[src] : [src] "+r"(src_argb), // %[src]
[src1] "+r"(src_argb1), // %[src1] [src1] "+r"(src_argb1), // %[src1]
[dst] "+r"(dst_argb), // %[dst] [dst] "+r"(dst_argb), // %[dst]
@ -1135,21 +1135,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr1 = src_ptr + src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
"uaddl v2.8h, v0.8b, v1.8b \n" "uaddl v2.8h, v0.8b, v1.8b \n"
"uaddl2 v3.8h, v0.16b, v1.16b \n" "uaddl2 v3.8h, v0.16b, v1.16b \n"
"uaddl v22.8h, v20.8b, v21.8b \n" "uaddl v22.8h, v20.8b, v21.8b \n"
"uaddl2 v23.8h, v20.16b, v21.16b \n" "uaddl2 v23.8h, v20.16b, v21.16b \n"
"add v0.8h, v2.8h, v22.8h \n" "add v0.8h, v2.8h, v22.8h \n"
"add v1.8h, v3.8h, v23.8h \n" "add v1.8h, v3.8h, v23.8h \n"
"rshrn v0.8b, v0.8h, #2 \n" "rshrn v0.8b, v0.8h, #2 \n"
"rshrn v1.8b, v1.8h, #2 \n" "rshrn v1.8b, v1.8h, #2 \n"
"subs %w[width], %w[width], #4 \n" "subs %w[width], %w[width], #4 \n"
"stp d0, d1, [%[dst]], #16 \n" "stp d0, d1, [%[dst]], #16 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst), : [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst),
[width] "+r"(dst_width) [width] "+r"(dst_width)
: :
@ -1166,26 +1166,22 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
const uint8_t* src_argb3 = src_argb + src_stepx * 12; const uint8_t* src_argb3 = src_argb + src_stepx * 12;
int64_t i = 0; int64_t i = 0;
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ldr w10, [%[src], %[i]] \n" "ldr w10, [%[src], %[i]] \n"
"ldr w11, [%[src1], %[i]] \n" "ldr w11, [%[src1], %[i]] \n"
"ldr w12, [%[src2], %[i]] \n" "ldr w12, [%[src2], %[i]] \n"
"ldr w13, [%[src3], %[i]] \n" "ldr w13, [%[src3], %[i]] \n"
"add %[i], %[i], %[step] \n" "add %[i], %[i], %[step] \n"
"subs %w[width], %w[width], #4 \n" "subs %w[width], %w[width], #4 \n"
"prfm pldl1keep, [%[src], 448] \n" "prfm pldl1keep, [%[src], 448] \n"
"stp w10, w11, [%[dst]], #8 \n" "stp w10, w11, [%[dst]], #8 \n"
"stp w12, w13, [%[dst]], #8 \n" "stp w12, w13, [%[dst]], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src]"+r"(src_argb), : [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2),
[src1]"+r"(src_argb1), [src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width),
[src2]"+r"(src_argb2), [i] "+r"(i)
[src3]"+r"(src_argb3), : [step] "r"((int64_t)(src_stepx * 16))
[dst]"+r"(dst_argb),
[width]"+r"(dst_width),
[i]"+r"(i)
: [step]"r"((int64_t)(src_stepx * 16))
: "memory", "cc", "w10", "w11", "w12", "w13"); : "memory", "cc", "w10", "w11", "w12", "w13");
} }
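The even-downsample kernel gathers one 32-bit ARGB word every src_stepx pixels (the four w10..w13 loads above are four such gathers per iteration); a hedged scalar equivalent (name illustrative):

#include <stdint.h>

static void ScaleARGBRowDownEvenRef(const uint8_t* src_argb, int src_stepx,
                                    uint8_t* dst_argb, int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;  // one ARGB pixel per word
  uint32_t* dst = (uint32_t*)dst_argb;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}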
@ -1312,33 +1308,33 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"1: \n" // "1: \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ldr d1, [%6] \n" // "ldr d1, [%6] \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ldr d2, [%6] \n" "ldr d2, [%6] \n"
"shrn v4.4h, v5.4s, #9 \n" // "shrn v4.4h, v5.4s, #9 \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ld1 {v1.d}[1], [%6] \n" // "ld1 {v1.d}[1], [%6] \n" //
SCALE_ARGB_FILTER_COLS_STEP_ADDR SCALE_ARGB_FILTER_COLS_STEP_ADDR
"ld1 {v2.d}[1], [%6] \n" "ld1 {v2.d}[1], [%6] \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop "subs %w2, %w2, #4 \n" // 4 processed per loop
"and v4.8b, v4.8b, v3.8b \n" "and v4.8b, v4.8b, v3.8b \n"
"trn1 v0.4s, v1.4s, v2.4s \n" "trn1 v0.4s, v1.4s, v2.4s \n"
"tbl v4.16b, {v4.16b}, v18.16b \n" // f "tbl v4.16b, {v4.16b}, v18.16b \n" // f
"trn2 v1.4s, v1.4s, v2.4s \n" "trn2 v1.4s, v1.4s, v2.4s \n"
"eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f
"umull v16.8h, v1.8b, v4.8b \n" "umull v16.8h, v1.8b, v4.8b \n"
"umull2 v17.8h, v1.16b, v4.16b \n" "umull2 v17.8h, v1.16b, v4.16b \n"
"umlal v16.8h, v0.8b, v7.8b \n" "umlal v16.8h, v0.8b, v7.8b \n"
"umlal2 v17.8h, v0.16b, v7.16b \n" "umlal2 v17.8h, v0.16b, v7.16b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"shrn v0.8b, v16.8h, #7 \n" "shrn v0.8b, v16.8h, #7 \n"
"shrn v1.8b, v17.8h, #7 \n" "shrn v1.8b, v17.8h, #7 \n"
"add v5.4s, v5.4s, v6.4s \n" "add v5.4s, v5.4s, v6.4s \n"
"stp d0, d1, [%0], #16 \n" // store pixels "stp d0, d1, [%0], #16 \n" // store pixels
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(dst_argb), // %0 : "+r"(dst_argb), // %0
"+r"(src_argb), // %1 "+r"(src_argb), // %1
"+r"(dst_width), // %2 "+r"(dst_width), // %2
@ -1360,34 +1356,34 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"subs %w[dst_width], %w[dst_width], #32 \n" "subs %w[dst_width], %w[dst_width], #32 \n"
"b.lt 2f \n" "b.lt 2f \n"
"1: \n" "1: \n"
"ldp q0, q1, [%[src_ptr]] \n" "ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n" "ldp q2, q3, [%[src_ptr], #32] \n"
"ldp q4, q5, [%[src_ptr], #64] \n" "ldp q4, q5, [%[src_ptr], #64] \n"
"ldp q6, q7, [%[src_ptr], #96] \n" "ldp q6, q7, [%[src_ptr], #96] \n"
"add %[src_ptr], %[src_ptr], #128 \n" "add %[src_ptr], %[src_ptr], #128 \n"
"uzp2 v0.8h, v0.8h, v1.8h \n" "uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n" "uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n" "uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n" "uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"stp q0, q1, [%[dst_ptr]] \n" "stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n" "stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n" "add %[dst_ptr], %[dst_ptr], #64 \n"
"b.ge 1b \n" "b.ge 1b \n"
"2: \n" "2: \n"
"adds %w[dst_width], %w[dst_width], #32 \n" "adds %w[dst_width], %w[dst_width], #32 \n"
"b.eq 99f \n" "b.eq 99f \n"
"ldp q0, q1, [%[src_ptr]] \n" "ldp q0, q1, [%[src_ptr]] \n"
"ldp q2, q3, [%[src_ptr], #32] \n" "ldp q2, q3, [%[src_ptr], #32] \n"
"uzp2 v0.8h, v0.8h, v1.8h \n" "uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n" "uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%[dst_ptr]] \n" "stp q0, q1, [%[dst_ptr]] \n"
"99: \n" "99: \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
@ -1403,15 +1399,15 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
"subs %w[dst_width], %w[dst_width], #16 \n" "subs %w[dst_width], %w[dst_width], #16 \n"
"urhadd v0.8h, v0.8h, v1.8h \n" "urhadd v0.8h, v0.8h, v1.8h \n"
"urhadd v1.8h, v2.8h, v3.8h \n" "urhadd v1.8h, v2.8h, v3.8h \n"
"prfm pldl1keep, [%[src_ptr], 448] \n" "prfm pldl1keep, [%[src_ptr], 448] \n"
"stp q0, q1, [%[dst_ptr]], #32 \n" "stp q0, q1, [%[dst_ptr]], #32 \n"
"b.gt 1b \n" "b.gt 1b \n"
: [src_ptr] "+r"(src_ptr), // %[src_ptr] : [src_ptr] "+r"(src_ptr), // %[src_ptr]
[dst_ptr] "+r"(dst), // %[dst_ptr] [dst_ptr] "+r"(dst), // %[dst_ptr]
[dst_width] "+r"(dst_width) // %[dst_width] [dst_width] "+r"(dst_width) // %[dst_width]
@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint16_t* dst, uint16_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n" "1: \n"
@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride, ptrdiff_t src_stride,
uint8_t* dst, uint8_t* dst,
int dst_width) { int dst_width) {
asm volatile ( asm volatile(
// change the stride to row 2 pointer // change the stride to row 2 pointer
"add %1, %1, %0 \n" "add %1, %1, %0 \n"
"1: \n" "1: \n"
@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6; const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride; (void)src_stride;
asm volatile ( asm volatile(
"1: \n" "1: \n"
"ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n"


@ -10,8 +10,8 @@
#include "libyuv/scale.h" /* For FilterMode */ #include "libyuv/scale.h" /* For FilterMode */
#include <limits.h>
#include <assert.h> #include <assert.h>
#include <limits.h>
#include <stdint.h> #include <stdint.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb,
int dst_height, int dst_height,
enum FilterMode filtering) { enum FilterMode filtering) {
int r; int r;
if (!src_rgb || !dst_rgb || if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) { dst_height <= 0) {
return -1; return -1;
} }
const int abs_src_height = (src_height < 0) ? -src_height : src_height; const int abs_src_height = (src_height < 0) ? -src_height : src_height;
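The reordered validation clamps widths to INT_MAX / 4, presumably because the scaler works on 4-byte ARGB rows internally, so width * 4 must still fit in an int; a hedged illustration of the guard:

#include <limits.h>

// With 4 bytes per pixel, width * 4 must not overflow int.
static int ValidWidthForARGB(int width) {
  return width > 0 && width <= INT_MAX / 4;
}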


@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
const uint32_t* src = (const uint32_t*)(src_argb); const uint32_t* src = (const uint32_t*)(src_argb);
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m4_t v_odd, v_even, v_dst; vuint8m4_t v_odd, v_even, v_dst;
vuint32m4_t v_odd_32, v_even_32; vuint32m4_t v_odd_32, v_even_32;
@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
const int stride_byte = src_stepx * 4; const int stride_byte = src_stepx * 4;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
vuint16m8_t v_row0_sum, v_row1_sum, v_sum; vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
(void)src_stride; (void)src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m4_t v_s0, v_s1, v_dst; vuint8m4_t v_s0, v_s1, v_dst;
size_t vl = __riscv_vsetvl_e8m4(w); size_t vl = __riscv_vsetvl_e8m4(w);
@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
size_t w = (size_t)dst_width; size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
size_t vl = __riscv_vsetvl_e8m4(w); size_t vl = __riscv_vsetvl_e8m4(w);
vuint8m4_t v_s0, v_s1, v_t0, v_t1; vuint8m4_t v_s0, v_s1, v_t0, v_t1;
@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
size_t w = (size_t)dst_width; size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint8m2_t v_t0, v_t1, v_t2, v_t3; vuint8m2_t v_t0, v_t1, v_t2, v_t3;
@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
const uint8_t* t = src_ptr + src_stride; const uint8_t* t = src_ptr + src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
const uint8_t* t = src_ptr + src_stride; const uint8_t* t = src_ptr + src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_s0, v_s1, v_s2, v_s3;
vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
(void)src_stride; (void)src_stride;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint8m4_t v_u0v0, v_u1v1, v_avg;
vuint16m4_t v_u0v0_16, v_u1v1_16; vuint16m4_t v_u0v0_16, v_u1v1_16;
@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
size_t w = (size_t)dst_width; size_t w = (size_t)dst_width;
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode(0). // register) is set to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0"); asm volatile("csrwi vxrm, 0");
do { do {
vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
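The csrwi vxrm, 0 writes select round-to-nearest-up for the RVV averaging instructions (vaaddu and friends), matching the (a + b + 1) >> 1 rounding the other SIMD paths use; a hedged scalar picture:

#include <stdint.h>

// Rounding average as produced by vaaddu under vxrm = 0 (round-to-nearest-up).
static uint8_t AvgRoundUp(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}
// Under vxrm = 2 (round-down) the result can differ by one:
// AvgRoundUp(1, 2) == 2, while (1 + 2) >> 1 == 1.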


@ -15,7 +15,6 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
defined(__aarch64__) defined(__aarch64__)


@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width,
#endif #endif
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV) #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV)
if (TestCpuFlag(kCpuHasRVV) && !filtering) { if (TestCpuFlag(kCpuHasRVV) && !filtering) {
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) #if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
ScaleUVRowDownEven = ScaleUVRowDownEven_RVV; ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
#endif #endif
#if defined(HAS_SCALEUVROWDOWN4_RVV) #if defined(HAS_SCALEUVROWDOWN4_RVV)
if (col_step == 4) { if (col_step == 4) {
ScaleUVRowDownEven = ScaleUVRowDown4_RVV; ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
} }
#endif #endif
} }
#endif #endif


@ -12,6 +12,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h" #include "libyuv/basic_types.h"
#include "libyuv/compare.h" #include "libyuv/compare.h"
#include "libyuv/convert.h" #include "libyuv/convert.h"
@ -19,7 +20,6 @@
#include "libyuv/convert_from.h" #include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h" #include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "../unit_test/unit_test.h"
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include "libyuv/rotate.h" #include "libyuv/rotate.h"
#include "libyuv/video_common.h" #include "libyuv/video_common.h"


@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
#endif #endif
#ifdef __linux__ #ifdef __linux__
static void KernelVersion(int *version) { static void KernelVersion(int* version) {
struct utsname buffer; struct utsname buffer;
int i = 0; int i = 0;
version[0] = version[1] = 0; version[0] = version[1] = 0;
if (uname(&buffer) == 0) { if (uname(&buffer) == 0) {
char *v = buffer.release; char* v = buffer.release;
for (i = 0; *v && i < 2; ++v) { for (i = 0; *v && i < 2; ++v) {
if (isdigit(*v)) { if (isdigit(*v)) {
version[i++] = (int) strtol(v, &v, 10); version[i++] = (int)strtol(v, &v, 10);
} }
} }
} }
@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
// Read and print the RVV vector length. // Read and print the RVV vector length.
if (has_rvv) { if (has_rvv) {
register uint32_t vlenb __asm__ ("t0"); register uint32_t vlenb __asm__("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb)); __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
printf("RVV vector length: %d bytes\n", vlenb); printf("RVV vector length: %d bytes\n", vlenb);
} }
} }
@ -161,7 +161,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
#if defined(__loongarch__) #if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
if (has_loongarch) { if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX); int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX); int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch); printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx); printf("Has LSX 0x%x\n", has_lsx);
@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
} }
#endif // defined(__loongarch__) #endif // defined(__loongarch__)
#if defined(__i386__) || defined(__x86_64__) || \ #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_IX86) || defined(_M_X64) defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86); int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) { if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8); printf("Has AMXINT8 0x%x\n", has_amxint8);
} }
#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) #endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
// defined(_M_X64)
} }
TEST_F(LibYUVBaseTest, TestCompilerMacros) { TEST_F(LibYUVBaseTest, TestCompilerMacros) {


@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width,
// Disable all optimizations. // Disable all optimizations.
MaskCpuFlags(disable_cpu_flags); MaskCpuFlags(disable_cpu_flags);
for (int i = 0; i < benchmark_iterations; i++) { for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert); CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width,
benchmark_width, benchmark_height * invert);
} }
// Enable optimizations. // Enable optimizations.
MaskCpuFlags(benchmark_cpu_info); MaskCpuFlags(benchmark_cpu_info);
for (int i = 0; i < benchmark_iterations; i++) { for (int i = 0; i < benchmark_iterations; i++) {
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert); CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width,
benchmark_width, benchmark_height * invert);
} }
int max_diff = 0; int max_diff = 0;
for (int i = 0; i < y_plane_size; ++i) { for (int i = 0; i < y_plane_size; ++i) {
int abs_diff = abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i])); int abs_diff =
abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
if (abs_diff > max_diff) { if (abs_diff > max_diff) {
max_diff = abs_diff; max_diff = abs_diff;
} }
@ -1596,29 +1599,29 @@ static int TestCopyPlane(int benchmark_width,
TEST_F(LibYUVPlanarTest, CopyPlane_Any) { TEST_F(LibYUVPlanarTest, CopyPlane_Any) {
int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_, int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_,
benchmark_iterations_, disable_cpu_flags_, benchmark_iterations_, disable_cpu_flags_,
benchmark_cpu_info_, +1, 0); benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 0); EXPECT_LE(max_diff, 0);
} }
TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) { TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) {
int max_diff = int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1); disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
EXPECT_LE(max_diff, 0); EXPECT_LE(max_diff, 0);
} }
TEST_F(LibYUVPlanarTest, CopyPlane_Invert) { TEST_F(LibYUVPlanarTest, CopyPlane_Invert) {
int max_diff = int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0); disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
EXPECT_LE(max_diff, 0); EXPECT_LE(max_diff, 0);
} }
TEST_F(LibYUVPlanarTest, CopyPlane_Opt) { TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
int max_diff = int max_diff =
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
EXPECT_LE(max_diff, 0); EXPECT_LE(max_diff, 0);
} }
@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width,
// Disable all optimizations. // Disable all optimizations.
MaskCpuFlags(disable_cpu_flags); MaskCpuFlags(disable_cpu_flags);
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2, HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2, benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_c),
scale, benchmark_width, benchmark_height * invert); benchmark_width * 2, scale, benchmark_width,
benchmark_height * invert);
} }
// Enable optimizations. // Enable optimizations.
MaskCpuFlags(benchmark_cpu_info); MaskCpuFlags(benchmark_cpu_info);
for (j = 0; j < benchmark_iterations; j++) { for (j = 0; j < benchmark_iterations; j++) {
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2, HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2, benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_opt),
scale, benchmark_width, benchmark_height * invert); benchmark_width * 2, scale, benchmark_width,
benchmark_height * invert);
} }
int max_diff = 0; int max_diff = 0;
@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
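Note: the calls above exercise HalfFloatPlane, which multiplies each 16-bit sample by a scale and stores the result as an IEEE half float in a uint16_t plane, matching the argument order visible in this diff. A minimal standalone sketch for 10-bit input (the header path and tight strides are assumptions):

#include <stdint.h>

#include "libyuv/planar_functions.h"  // assumed location of HalfFloatPlane

// Normalize a 10-bit plane to [0.0, 1.0] half floats. Strides are in bytes
// (two bytes per sample), as in the test above.
static void TenBitPlaneToHalf(const uint16_t* src, uint16_t* dst,
                              int width, int height) {
  libyuv::HalfFloatPlane(src, width * 2,  // src + stride
                         dst, width * 2,  // dst + stride
                         1.0f / 1023.0f,  // scale applied before conversion
                         width, height);
}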
@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
#if defined(__arm__) #if defined(__arm__)
static void EnableFlushDenormalToZero(void) { static void EnableFlushDenormalToZero(void) {
uint32_t cw; uint32_t cw;
asm volatile ( asm volatile(
"vmrs %0, fpscr \n" "vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n" "orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n" "vmsr fpscr, %0 \n"
: "=r"(cw) : "=r"(cw)::"memory", "cc"); // Clobber List
::"memory", "cc"); // Clobber List
} }
static void DisableFlushDenormalToZero(void) { static void DisableFlushDenormalToZero(void) {
uint32_t cw; uint32_t cw;
asm volatile ( asm volatile(
"vmrs %0, fpscr \n" "vmrs %0, fpscr \n"
"bic %0, %0, #0x1000000 \n" "bic %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n" "vmsr fpscr, %0 \n"
: "=r"(cw) : "=r"(cw)::"memory", "cc"); // Clobber List
::"memory", "cc"); // Clobber List
} }
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) {
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
// 32 bit arm rounding on denormal case is off by 1 compared to C. // 32 bit arm rounding on denormal case is off by 1 compared to C.
EnableFlushDenormalToZero(); EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
DisableFlushDenormalToZero(); DisableFlushDenormalToZero();
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
EnableFlushDenormalToZero(); EnableFlushDenormalToZero();
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, int diff = TestHalfFloatPlane(
benchmark_iterations_, disable_cpu_flags_, benchmark_width_, benchmark_height_, benchmark_iterations_,
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
DisableFlushDenormalToZero(); DisableFlushDenormalToZero();
EXPECT_EQ(0, diff); EXPECT_EQ(0, diff);
} }
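Note: the two tests above toggle flush-to-zero (FPSCR bit 24, the 0x1000000 mask in the asm) because a half float has a 5-bit exponent with a bias of 15, so its smallest normal magnitude is 2^-14; scaled inputs below that become denormal halves, which is why the 32-bit Arm result can be off by 1 from the C path. A small worked check of that threshold (values this close to the boundary may still round up, so the comparison is only approximate):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

// Smallest normal half float is 2^-14 (about 6.1035e-5). With
// scale = 1/65535, inputs 1..3 fall below it and underflow to denormals,
// while 4 and above stay normal.
int main(void) {
  const float kMinNormalHalf = ldexpf(1.0f, -14);
  const float scale = 1.0f / 65535.0f;
  const uint16_t samples[] = {1, 3, 4, 65535};
  for (int i = 0; i < 4; ++i) {
    const float scaled = samples[i] * scale;
    printf("%5u * scale = %.8g -> %s\n", (unsigned)samples[i], scaled,
           scaled < kMinNormalHalf ? "denormal half" : "normal half");
  }
  return 0;
}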
@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
tmp_pixels_c_b, benchmark_width_, benchmark_width_, tmp_pixels_c_b, benchmark_width_, benchmark_width_,
benchmark_height_); benchmark_height_);
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, benchmark_width_, tmp_pixels_c_b, benchmark_width_,
benchmark_width_ * 3, benchmark_width_, benchmark_height_); dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_); MaskCpuFlags(benchmark_cpu_info_);
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r, SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r,
@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
tmp_pixels_c_b, benchmark_width_, benchmark_width_, tmp_pixels_c_b, benchmark_width_, benchmark_width_,
benchmark_height_); benchmark_height_);
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, benchmark_width_, tmp_pixels_c_b, benchmark_width_,
benchmark_width_ * 3, benchmark_width_, benchmark_height_); dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_); MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL,
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, 0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
benchmark_height_); benchmark_height_);
} }
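Note: the MergeXRGBPlane loop above passes NULL with a zero stride for the alpha plane, so MergeARGBPlane supplies the alpha channel itself (presumably opaque, as the XRGB naming suggests). A minimal sketch of that call pattern, mirroring the argument order in the test (the header path and tight strides are assumptions):

#include <stddef.h>
#include <stdint.h>

#include "libyuv/planar_functions.h"  // assumed location of MergeARGBPlane

// Interleave separate R, G and B planes into an ARGB buffer, letting
// libyuv generate the alpha channel (NULL src_a, stride 0).
static void MergePlanesToARGB(const uint8_t* r, const uint8_t* g,
                              const uint8_t* b, uint8_t* dst_argb,
                              int width, int height) {
  libyuv::MergeARGBPlane(r, width,             // src_r + stride
                         g, width,             // src_g + stride
                         b, width,             // src_b + stride
                         NULL, 0,              // no alpha plane provided
                         dst_argb, width * 4,  // 4 bytes per ARGB pixel
                         width, height);
}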
@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
for (int i = 0; i < benchmark_iterations_; ++i) { for (int i = 0; i < benchmark_iterations_; ++i) {
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r, SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r,
benchmark_width_, tmp_pixels_opt_g, benchmark_width_, benchmark_width_, tmp_pixels_opt_g, benchmark_width_,
tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0,
benchmark_height_); benchmark_width_, benchmark_height_);
} }
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
View File
@ -320,16 +320,16 @@ TEST_FACTOR(3, 1, 3)
#ifndef DISABLE_SLOW_TESTS #ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters. // Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \ #define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(, name, width, height, None, 0) \ TEST_SCALETO1(, name, width, height, None, 0) \
TEST_SCALETO1(, name, width, height, Linear, 3) \ TEST_SCALETO1(, name, width, height, Linear, 3) \
TEST_SCALETO1(, name, width, height, Bilinear, 3) \ TEST_SCALETO1(, name, width, height, Bilinear, 3) \
TEST_SCALETO1(, name, width, height, Box, 3) TEST_SCALETO1(, name, width, height, Box, 3)
#else #else
#if defined(ENABLE_FULL_TESTS) #if defined(ENABLE_FULL_TESTS)
#define TEST_SCALETO(name, width, height) \ #define TEST_SCALETO(name, width, height) \
TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
TEST_SCALETO1(DISABLED_, name, width, height, Box, 3) TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
#else #else
View File
@ -1058,7 +1058,7 @@ TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 1280, 720) TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080) TEST_SCALETO(Scale, 1920, 1080)
TEST_SCALETO(Scale, 1080, 1920) // for rotated phones TEST_SCALETO(Scale, 1080, 1920) // for rotated phones
#endif // DISABLE_SLOW_TESTS #endif // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1 #undef TEST_SCALETO1
#undef TEST_SCALETO #undef TEST_SCALETO