mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Apply clang format
Bug: None
Change-Id: I0d9db4b384144523e61ae32b6ab3f72e93a0c265
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6138934
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
parent b5a18f9d93
commit e0040eb318
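For orientation before the hunks: the reformatting is mechanical. A minimal sketch of the two dominant changes, assuming libyuv's clang-format (Chromium-based) style; the shapes below are illustrative, not verbatim excerpts from the diff:

#include <stdint.h>

/* 1. Over-long parameter lists are re-wrapped so continuation lines align
 *    with the opening parenthesis of the declaration. */
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
                              uint8_t* dst_y,
                              int scale,
                              int width);

/* 2. Inline-assembly statements lose the space before the opening
 *    parenthesis: `asm volatile (` becomes `asm volatile(`, with the quoted
 *    operand strings re-indented under it. */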
@@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20;
static const int kCpuHasLSX = 0x100;
static const int kCpuHasLASX = 0x200;

// Optional init function. TestCpuFlag does an auto-init.
// Returns cpu_info flags.
LIBYUV_API
@@ -3613,9 +3613,9 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
int scale,
int width);
void Convert16To8Row_AVX512BW(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width);
uint8_t* dst_y,
int scale,
int width);
void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
uint8_t* dst_ptr,
int scale,
@@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TORGB565_SVE_FROM_TOP_2X
"st2h {z18.h, z19.h}, p1, [%[dst]] \n"

"99: \n"
@@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TOARGB1555_SVE_FROM_TOP_2X
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"

"99: \n"
@@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
// Calculate a predicate for the final iteration to deal with the tail.
"cnth %[vl] \n"
"whilelt p1.b, wzr, %w[width] \n" //
READYUV422_SVE_2X I422TORGB_SVE_2X
RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
RGB8TOARGB4444_SVE_FROM_TOP_2X
"st2h {z0.h, z1.h}, p1, [%[dst]] \n"

"99: \n"
@@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint64_t diff;

asm volatile (
asm volatile(
"xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
@@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
int count) {
uint32_t diff = 0u;

asm volatile (
asm volatile(
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
@@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
int count) {
uint32_t diff;

asm volatile (
asm volatile(
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
@@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
int count) {
uint32_t diff;

asm volatile (
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
@@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
@@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = {

uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile (
asm volatile(
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
@@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;

asm volatile (
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator

"1: \n"
@@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
@@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"movi v4.8h, #0 \n"

"1: \n"
@@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t sse;
asm volatile (
asm volatile(
"movi v16.16b, #0 \n"
"movi v17.16b, #0 \n"
"movi v18.16b, #0 \n"
@@ -116,30 +116,30 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash = seed;
const uint32_t c16 = 0x92d9e201; // 33^16
uint32_t tmp, tmp2;
asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n"
"ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n"

// count is always a multiple of 16.
// maintain two accumulators, reduce and then final sum in scalar since
// this has better performance on little cores.
"1: \n"
"ldr q0, [%[src]], #16 \n"
"subs %w[count], %w[count], #16 \n"
"tbl v3.16b, {v0.16b}, v19.16b \n"
"tbl v2.16b, {v0.16b}, v18.16b \n"
"tbl v1.16b, {v0.16b}, v17.16b \n"
"tbl v0.16b, {v0.16b}, v16.16b \n"
"mul v3.4s, v3.4s, v7.4s \n"
"mul v2.4s, v2.4s, v6.4s \n"
"mla v3.4s, v1.4s, v5.4s \n"
"mla v2.4s, v0.4s, v4.4s \n"
"addv s1, v3.4s \n"
"addv s0, v2.4s \n"
"fmov %w[tmp2], s1 \n"
"fmov %w[tmp], s0 \n"
"add %w[tmp], %w[tmp], %w[tmp2] \n"
"madd %w[hash], %w[hash], %w[c16], %w[tmp] \n"
"b.gt 1b \n"
: [hash] "+r"(hash), // %[hash]
[count] "+r"(count), // %[count]
[tmp] "=&r"(tmp), // %[tmp]
@@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
const uint8_t* src_b,
int count) {
uint32_t diff;
asm volatile (
asm volatile(
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
"movi v6.16b, #1 \n"
@@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
int count) {
// count is guaranteed to be a multiple of 32.
uint32_t sse;
asm volatile (
asm volatile(
"movi v4.4s, #0 \n"
"movi v5.4s, #0 \n"
@@ -665,7 +665,7 @@ int I010ToNV12(const uint16_t* src_y,
void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
int width) = Convert16To8Row_C;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 ||
height == 0) {
return -1;
@@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample,
uint8_t* rotate_buffer = NULL;
int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;

if (dst_argb == NULL || sample == NULL ||
src_width <= 0 || src_width > INT_MAX / 4 ||
crop_width <= 0 || crop_width > INT_MAX / 4 ||
if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
src_height == 0 || crop_height == 0) {
return -1;
}
@@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample,
}

if (need_buf) {
const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height;
const uint64_t rotate_buffer_size =
(uint64_t)crop_width * 4 * abs_crop_height;
if (rotate_buffer_size > SIZE_MAX) {
return -1; // Invalid size.
}
@@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample,
const int inv_crop_height =
(src_height < 0) ? -abs_crop_height : abs_crop_height;

if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 ||
crop_width <= 0 || src_height == 0 || crop_height == 0) {
if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
crop_height == 0) {
return -1;
}
@@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample,
if (need_buf) {
int y_size = crop_width * abs_crop_height;
int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2;
const uint64_t rotate_buffer_size =
(uint64_t)y_size + (uint64_t)uv_size * 2;
if (rotate_buffer_size > SIZE_MAX) {
return -1; // Invalid size.
}
@@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb,
#endif
#if defined(HAS_COPYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
CopyRow =
IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
}
#endif
#if defined(HAS_COPYROW_ERMS)
@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
uint8_t* dst_b,
int dst_stride_b,
int width) {
asm volatile (
asm volatile(
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
@@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
@@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile (
asm volatile(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d
@ -27,57 +27,57 @@ void TransposeWx8_NEON(const uint8_t* src,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %[width], #8 \n"
|
||||
"sub %[width], #8 \n"
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d7}, [%[temp]] \n"
|
||||
"add %[src], #8 \n"
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld1.8 {d0}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d2}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d4}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d6}, [%[temp]], %[src_stride] \n"
|
||||
"vld1.8 {d7}, [%[temp]] \n"
|
||||
"add %[src], #8 \n"
|
||||
|
||||
"vtrn.8 d1, d0 \n"
|
||||
"vtrn.8 d3, d2 \n"
|
||||
"vtrn.8 d5, d4 \n"
|
||||
"vtrn.8 d7, d6 \n"
|
||||
"subs %[width], #8 \n"
|
||||
"vtrn.8 d1, d0 \n"
|
||||
"vtrn.8 d3, d2 \n"
|
||||
"vtrn.8 d5, d4 \n"
|
||||
"vtrn.8 d7, d6 \n"
|
||||
"subs %[width], #8 \n"
|
||||
|
||||
"vtrn.16 d1, d3 \n"
|
||||
"vtrn.16 d0, d2 \n"
|
||||
"vtrn.16 d5, d7 \n"
|
||||
"vtrn.16 d4, d6 \n"
|
||||
"vtrn.16 d1, d3 \n"
|
||||
"vtrn.16 d0, d2 \n"
|
||||
"vtrn.16 d5, d7 \n"
|
||||
"vtrn.16 d4, d6 \n"
|
||||
|
||||
"vtrn.32 d1, d5 \n"
|
||||
"vtrn.32 d0, d4 \n"
|
||||
"vtrn.32 d3, d7 \n"
|
||||
"vtrn.32 d2, d6 \n"
|
||||
"vtrn.32 d1, d5 \n"
|
||||
"vtrn.32 d0, d4 \n"
|
||||
"vtrn.32 d3, d7 \n"
|
||||
"vtrn.32 d2, d6 \n"
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
|
||||
"mov %[temp], %[dst] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d6}, [%[temp]] \n"
|
||||
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
|
||||
"mov %[temp], %[dst] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride] \n"
|
||||
"vst1.8 {d6}, [%[temp]] \n"
|
||||
"add %[dst], %[dst], %[dst_stride], lsl #3 \n"
|
||||
|
||||
"bge 1b \n"
|
||||
"bge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
@ -95,72 +95,72 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %[width], #8 \n"
|
||||
"sub %[width], #8 \n"
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d22, d23}, [%[temp]] \n"
|
||||
"add %[src], #8*2 \n"
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n"
|
||||
"vld2.8 {d22, d23}, [%[temp]] \n"
|
||||
"add %[src], #8*2 \n"
|
||||
|
||||
"vtrn.8 q1, q0 \n"
|
||||
"vtrn.8 q3, q2 \n"
|
||||
"vtrn.8 q9, q8 \n"
|
||||
"vtrn.8 q11, q10 \n"
|
||||
"subs %[width], #8 \n"
|
||||
"vtrn.8 q1, q0 \n"
|
||||
"vtrn.8 q3, q2 \n"
|
||||
"vtrn.8 q9, q8 \n"
|
||||
"vtrn.8 q11, q10 \n"
|
||||
"subs %[width], #8 \n"
|
||||
|
||||
"vtrn.16 q1, q3 \n"
|
||||
"vtrn.16 q0, q2 \n"
|
||||
"vtrn.16 q9, q11 \n"
|
||||
"vtrn.16 q8, q10 \n"
|
||||
"vtrn.16 q1, q3 \n"
|
||||
"vtrn.16 q0, q2 \n"
|
||||
"vtrn.16 q9, q11 \n"
|
||||
"vtrn.16 q8, q10 \n"
|
||||
|
||||
"vtrn.32 q1, q9 \n"
|
||||
"vtrn.32 q0, q8 \n"
|
||||
"vtrn.32 q3, q11 \n"
|
||||
"vtrn.32 q2, q10 \n"
|
||||
"vtrn.32 q1, q9 \n"
|
||||
"vtrn.32 q0, q8 \n"
|
||||
"vtrn.32 q3, q11 \n"
|
||||
"vtrn.32 q2, q10 \n"
|
||||
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q8, q8 \n"
|
||||
"vrev16.8 q9, q9 \n"
|
||||
"vrev16.8 q10, q10 \n"
|
||||
"vrev16.8 q11, q11 \n"
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q8, q8 \n"
|
||||
"vrev16.8 q9, q9 \n"
|
||||
"vrev16.8 q10, q10 \n"
|
||||
"vrev16.8 q11, q11 \n"
|
||||
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d20}, [%[temp]] \n"
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n"
|
||||
"vst1.8 {d20}, [%[temp]] \n"
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d21}, [%[temp]] \n"
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n"
|
||||
"vst1.8 {d21}, [%[temp]] \n"
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
|
||||
"bge 1b \n"
|
||||
"bge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst_a] "+r"(dst_a), // %[dst_a]
|
||||
@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
|
||||
|
||||
@ -27,104 +27,104 @@ void TransposeWx16_NEON(const uint8_t* src,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src_temp;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"mov %[src_temp], %[src] \n"
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"mov %[src_temp], %[src] \n"
|
||||
|
||||
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
"ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
|
||||
|
||||
"add %[src], %[src], #16 \n"
|
||||
"add %[src], %[src], #16 \n"
|
||||
|
||||
// Transpose bytes within each 2x2 block.
|
||||
"trn1 v0.16b, v16.16b, v17.16b \n"
|
||||
"trn2 v1.16b, v16.16b, v17.16b \n"
|
||||
"trn1 v2.16b, v18.16b, v19.16b \n"
|
||||
"trn2 v3.16b, v18.16b, v19.16b \n"
|
||||
"trn1 v4.16b, v20.16b, v21.16b \n"
|
||||
"trn2 v5.16b, v20.16b, v21.16b \n"
|
||||
"trn1 v6.16b, v22.16b, v23.16b \n"
|
||||
"trn2 v7.16b, v22.16b, v23.16b \n"
|
||||
"trn1 v8.16b, v24.16b, v25.16b \n"
|
||||
"trn2 v9.16b, v24.16b, v25.16b \n"
|
||||
"trn1 v10.16b, v26.16b, v27.16b \n"
|
||||
"trn2 v11.16b, v26.16b, v27.16b \n"
|
||||
"trn1 v12.16b, v28.16b, v29.16b \n"
|
||||
"trn2 v13.16b, v28.16b, v29.16b \n"
|
||||
"trn1 v14.16b, v30.16b, v31.16b \n"
|
||||
"trn2 v15.16b, v30.16b, v31.16b \n"
|
||||
"trn1 v0.16b, v16.16b, v17.16b \n"
|
||||
"trn2 v1.16b, v16.16b, v17.16b \n"
|
||||
"trn1 v2.16b, v18.16b, v19.16b \n"
|
||||
"trn2 v3.16b, v18.16b, v19.16b \n"
|
||||
"trn1 v4.16b, v20.16b, v21.16b \n"
|
||||
"trn2 v5.16b, v20.16b, v21.16b \n"
|
||||
"trn1 v6.16b, v22.16b, v23.16b \n"
|
||||
"trn2 v7.16b, v22.16b, v23.16b \n"
|
||||
"trn1 v8.16b, v24.16b, v25.16b \n"
|
||||
"trn2 v9.16b, v24.16b, v25.16b \n"
|
||||
"trn1 v10.16b, v26.16b, v27.16b \n"
|
||||
"trn2 v11.16b, v26.16b, v27.16b \n"
|
||||
"trn1 v12.16b, v28.16b, v29.16b \n"
|
||||
"trn2 v13.16b, v28.16b, v29.16b \n"
|
||||
"trn1 v14.16b, v30.16b, v31.16b \n"
|
||||
"trn2 v15.16b, v30.16b, v31.16b \n"
|
||||
|
||||
// Transpose 2x2-byte blocks within each 4x4 block.
|
||||
"trn1 v16.8h, v0.8h, v2.8h \n"
|
||||
"trn1 v17.8h, v1.8h, v3.8h \n"
|
||||
"trn2 v18.8h, v0.8h, v2.8h \n"
|
||||
"trn2 v19.8h, v1.8h, v3.8h \n"
|
||||
"trn1 v20.8h, v4.8h, v6.8h \n"
|
||||
"trn1 v21.8h, v5.8h, v7.8h \n"
|
||||
"trn2 v22.8h, v4.8h, v6.8h \n"
|
||||
"trn2 v23.8h, v5.8h, v7.8h \n"
|
||||
"trn1 v24.8h, v8.8h, v10.8h \n"
|
||||
"trn1 v25.8h, v9.8h, v11.8h \n"
|
||||
"trn2 v26.8h, v8.8h, v10.8h \n"
|
||||
"trn2 v27.8h, v9.8h, v11.8h \n"
|
||||
"trn1 v28.8h, v12.8h, v14.8h \n"
|
||||
"trn1 v29.8h, v13.8h, v15.8h \n"
|
||||
"trn2 v30.8h, v12.8h, v14.8h \n"
|
||||
"trn2 v31.8h, v13.8h, v15.8h \n"
|
||||
"trn1 v16.8h, v0.8h, v2.8h \n"
|
||||
"trn1 v17.8h, v1.8h, v3.8h \n"
|
||||
"trn2 v18.8h, v0.8h, v2.8h \n"
|
||||
"trn2 v19.8h, v1.8h, v3.8h \n"
|
||||
"trn1 v20.8h, v4.8h, v6.8h \n"
|
||||
"trn1 v21.8h, v5.8h, v7.8h \n"
|
||||
"trn2 v22.8h, v4.8h, v6.8h \n"
|
||||
"trn2 v23.8h, v5.8h, v7.8h \n"
|
||||
"trn1 v24.8h, v8.8h, v10.8h \n"
|
||||
"trn1 v25.8h, v9.8h, v11.8h \n"
|
||||
"trn2 v26.8h, v8.8h, v10.8h \n"
|
||||
"trn2 v27.8h, v9.8h, v11.8h \n"
|
||||
"trn1 v28.8h, v12.8h, v14.8h \n"
|
||||
"trn1 v29.8h, v13.8h, v15.8h \n"
|
||||
"trn2 v30.8h, v12.8h, v14.8h \n"
|
||||
"trn2 v31.8h, v13.8h, v15.8h \n"
|
||||
|
||||
"subs %w[width], %w[width], #16 \n"
|
||||
"subs %w[width], %w[width], #16 \n"
|
||||
|
||||
// Transpose 4x4-byte blocks within each 8x8 block.
|
||||
"trn1 v0.4s, v16.4s, v20.4s \n"
|
||||
"trn1 v2.4s, v17.4s, v21.4s \n"
|
||||
"trn1 v4.4s, v18.4s, v22.4s \n"
|
||||
"trn1 v6.4s, v19.4s, v23.4s \n"
|
||||
"trn2 v8.4s, v16.4s, v20.4s \n"
|
||||
"trn2 v10.4s, v17.4s, v21.4s \n"
|
||||
"trn2 v12.4s, v18.4s, v22.4s \n"
|
||||
"trn2 v14.4s, v19.4s, v23.4s \n"
|
||||
"trn1 v1.4s, v24.4s, v28.4s \n"
|
||||
"trn1 v3.4s, v25.4s, v29.4s \n"
|
||||
"trn1 v5.4s, v26.4s, v30.4s \n"
|
||||
"trn1 v7.4s, v27.4s, v31.4s \n"
|
||||
"trn2 v9.4s, v24.4s, v28.4s \n"
|
||||
"trn2 v11.4s, v25.4s, v29.4s \n"
|
||||
"trn2 v13.4s, v26.4s, v30.4s \n"
|
||||
"trn2 v15.4s, v27.4s, v31.4s \n"
|
||||
"trn1 v0.4s, v16.4s, v20.4s \n"
|
||||
"trn1 v2.4s, v17.4s, v21.4s \n"
|
||||
"trn1 v4.4s, v18.4s, v22.4s \n"
|
||||
"trn1 v6.4s, v19.4s, v23.4s \n"
|
||||
"trn2 v8.4s, v16.4s, v20.4s \n"
|
||||
"trn2 v10.4s, v17.4s, v21.4s \n"
|
||||
"trn2 v12.4s, v18.4s, v22.4s \n"
|
||||
"trn2 v14.4s, v19.4s, v23.4s \n"
|
||||
"trn1 v1.4s, v24.4s, v28.4s \n"
|
||||
"trn1 v3.4s, v25.4s, v29.4s \n"
|
||||
"trn1 v5.4s, v26.4s, v30.4s \n"
|
||||
"trn1 v7.4s, v27.4s, v31.4s \n"
|
||||
"trn2 v9.4s, v24.4s, v28.4s \n"
|
||||
"trn2 v11.4s, v25.4s, v29.4s \n"
|
||||
"trn2 v13.4s, v26.4s, v30.4s \n"
|
||||
"trn2 v15.4s, v27.4s, v31.4s \n"
|
||||
|
||||
// Transpose 8x8-byte blocks and store.
|
||||
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
"st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
|
||||
|
||||
"b.gt 1b \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src), // %[src]
|
||||
[src_temp] "=&r"(src_temp), // %[src_temp]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
@ -145,76 +145,76 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
int dst_stride_b,
|
||||
int width) {
|
||||
const uint8_t* temp;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %w[width], %w[width], #8 \n"
|
||||
"sub %w[width], %w[width], #8 \n"
|
||||
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v7.16b}, [%[temp]] \n"
|
||||
"add %[src], %[src], #16 \n"
|
||||
"1: \n"
|
||||
"mov %[temp], %[src] \n"
|
||||
"ld1 {v0.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v1.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v2.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v3.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v4.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v5.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v6.16b}, [%[temp]], %[src_stride] \n"
|
||||
"ld1 {v7.16b}, [%[temp]] \n"
|
||||
"add %[src], %[src], #16 \n"
|
||||
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
"trn1 v16.16b, v0.16b, v1.16b \n"
|
||||
"trn2 v17.16b, v0.16b, v1.16b \n"
|
||||
"trn1 v18.16b, v2.16b, v3.16b \n"
|
||||
"trn2 v19.16b, v2.16b, v3.16b \n"
|
||||
"trn1 v20.16b, v4.16b, v5.16b \n"
|
||||
"trn2 v21.16b, v4.16b, v5.16b \n"
|
||||
"trn1 v22.16b, v6.16b, v7.16b \n"
|
||||
"trn2 v23.16b, v6.16b, v7.16b \n"
|
||||
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
"trn1 v0.8h, v16.8h, v18.8h \n"
|
||||
"trn2 v1.8h, v16.8h, v18.8h \n"
|
||||
"trn1 v2.8h, v20.8h, v22.8h \n"
|
||||
"trn2 v3.8h, v20.8h, v22.8h \n"
|
||||
"trn1 v4.8h, v17.8h, v19.8h \n"
|
||||
"trn2 v5.8h, v17.8h, v19.8h \n"
|
||||
"trn1 v6.8h, v21.8h, v23.8h \n"
|
||||
"trn2 v7.8h, v21.8h, v23.8h \n"
|
||||
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
"trn1 v16.4s, v0.4s, v2.4s \n"
|
||||
"trn2 v17.4s, v0.4s, v2.4s \n"
|
||||
"trn1 v18.4s, v1.4s, v3.4s \n"
|
||||
"trn2 v19.4s, v1.4s, v3.4s \n"
|
||||
"trn1 v20.4s, v4.4s, v6.4s \n"
|
||||
"trn2 v21.4s, v4.4s, v6.4s \n"
|
||||
"trn1 v22.4s, v5.4s, v7.4s \n"
|
||||
"trn2 v23.4s, v5.4s, v7.4s \n"
|
||||
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[1], [%[temp]] \n"
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
"mov %[temp], %[dst_a] \n"
|
||||
"st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n"
|
||||
"st1 {v19.d}[1], [%[temp]] \n"
|
||||
"add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n"
|
||||
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[1], [%[temp]] \n"
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
"mov %[temp], %[dst_b] \n"
|
||||
"st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n"
|
||||
"st1 {v23.d}[1], [%[temp]] \n"
|
||||
"add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n"
|
||||
|
||||
"b.ge 1b \n"
|
||||
"b.ge 1b \n"
|
||||
: [temp] "=&r"(temp), // %[temp]
|
||||
[src] "+r"(src), // %[src]
|
||||
[dst_a] "+r"(dst_a), // %[dst_a]
|
||||
@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
|
||||
|
||||
File diff suppressed because it is too large.
@@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
int width,
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
int width,
const struct RgbConstants* rgbconstants) {
int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile (
asm volatile(
"xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
"xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
"xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct RgbConstants* rgbconstants) {
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
asm volatile (
asm volatile(
"vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
"vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
"vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
RGBTORGB8
|
||||
@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV444 YUVTORGB
|
||||
RGBTORGB8
|
||||
@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgba,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb565,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb1555,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"1: \n" READYUV422 YUVTORGB
|
||||
RGBTORGB8
|
||||
@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb4444,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"vmov.u8 d7, #0x0f \n" // vbic bits to clear
|
||||
@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUV400 YUVTORGB
|
||||
@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
|
||||
}
|
||||
|
||||
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d23, #255 \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d20}, [%0]! \n"
|
||||
@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READNV12 YUVTORGB RGBTORGB8
|
||||
@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READNV21 YUVTORGB RGBTORGB8
|
||||
@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READNV12 YUVTORGB RGBTORGB8
|
||||
@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb24,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READNV21 YUVTORGB RGBTORGB8
|
||||
@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_rgb565,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READNV12 YUVTORGB RGBTORGB8
|
||||
@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READYUY2 YUVTORGB RGBTORGB8
|
||||
@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
|
||||
uint8_t* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
YUVTORGB_SETUP
|
||||
"vmov.u8 d6, #255 \n"
|
||||
"1: \n" READUYVY YUVTORGB RGBTORGB8
|
||||
@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
|
||||
ptrdiff_t src_tile_stride,
|
||||
uint8_t* dst,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
|
||||
ptrdiff_t src_tile_stride,
|
||||
uint16_t* dst,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d1}, [%0], %4 \n"
|
||||
"subs %3, %3, #16 \n"
|
||||
@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
|
||||
"pld [%0, #1792] \n"
|
||||
@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
|
||||
"vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
|
||||
@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
#endif
|
||||
|
||||
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q14}, [%0]! \n" // Load lower bits.
|
||||
"vld1.8 {q9}, [%0]! \n" // Load upper bits row
|
||||
@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load U
|
||||
"vld1.8 {q1}, [%1]! \n" // load V
|
||||
@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
|
||||
"vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
|
||||
@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
|
||||
const uint8_t* src_b,
|
||||
uint8_t* dst_rgb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load R
|
||||
"vld1.8 {q1}, [%1]! \n" // load G
|
||||
@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_b,
|
||||
uint8_t* dst_a,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
|
||||
@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
|
||||
const uint8_t* src_a,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q2}, [%0]! \n" // load R
|
||||
"vld1.8 {q1}, [%1]! \n" // load G
|
||||
@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_g,
|
||||
uint8_t* dst_b,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
|
||||
@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
|
||||
const uint8_t* src_b,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 q3, #255 \n" // load A(255)
|
||||
"1: \n"
|
||||
"vld1.8 {q2}, [%0]! \n" // load R
|
||||
@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = 10 - depth;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u32 q14, #1023 \n"
|
||||
"vdup.32 q15, %5 \n"
|
||||
"1: \n"
|
||||
@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
|
||||
uint8_t* dst_ar30,
|
||||
int /* depth */,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u32 q14, #1023 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {d4}, [%2]! \n" // B
|
||||
@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
|
||||
int width) {
|
||||
int shift = 16 - depth;
|
||||
int mask = (1 << depth) - 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"vdup.u16 q15, %6 \n"
|
||||
"vdup.u16 q14, %7 \n"
|
||||
@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
|
||||
int width) {
|
||||
int shift = 16 - depth;
|
||||
int mask = (1 << depth) - 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"vmov.u8 q3, #0xff \n" // A (0xffff)
|
||||
"vdup.u16 q15, %5 \n"
|
||||
@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = 8 - depth;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"vdup.16 q15, %6 \n"
|
||||
"1: \n"
|
||||
@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = 8 - depth;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"vdup.16 q15, %5 \n"
|
||||
"vmov.u8 d6, #0xff \n" // A (0xff)
|
||||
@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
|
||||
|
||||
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
|
||||
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
|
||||
"subs %2, %2, #32 \n" // 32 processed per loop
|
||||
@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
|
||||
// SetRow writes 'width' bytes using an 8 bit value repeated.
|
||||
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.8 q0, %2 \n" // duplicate 16 bytes
|
||||
"1: \n"
|
||||
"subs %1, %1, #16 \n" // 16 bytes per loop
|
||||
@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
|
||||
|
||||
// ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
|
||||
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.u32 q0, %2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"subs %1, %1, #4 \n" // 4 pixels per loop
|
||||
@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
|
||||
}
|
||||
|
||||
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"add %0, %0, %2 \n"
|
||||
"sub %0, %0, #32 \n" // 32 bytes per loop
|
||||
@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
|
||||
}
|
||||
|
||||
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"mov r12, #-16 \n"
|
||||
"add %0, %0, %2, lsl #1 \n"
|
||||
@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// Start at end of source row.
|
||||
"mov r12, #-16 \n"
|
||||
"add %0, %0, %3, lsl #1 \n"
|
||||
@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
|
||||
}
|
||||
|
||||
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %0, %0, %2, lsl #2 \n"
|
||||
"sub %0, #32 \n"
|
||||
|
||||
@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
|
||||
uint8_t* dst_rgb24,
|
||||
int width) {
|
||||
src_rgb24 += width * 3 - 24;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
|
||||
"subs %2, #8 \n" // 8 pixels per loop.
|
||||
@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
|
||||
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d4, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
|
||||
@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
|
||||
}
|
||||
|
||||
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d4, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
||||
@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
}
|
||||
|
||||
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d0, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
||||
@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
|
||||
);
|
||||
}
|
||||
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
|
||||
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
|
||||
@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
|
||||
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
|
||||
@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
|
||||
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // Alpha
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
|
||||
@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
|
||||
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_rgb24,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||
@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
|
||||
}
|
||||
|
||||
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
|
||||
}
|
||||
|
||||
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
|
||||
}
|
||||
|
||||
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
|
||||
@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // stride + src_yuy2
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // stride + src_uyvy
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
|
||||
@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
|
||||
int stride_yuy2,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // stride + src_yuy2
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
|
||||
@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
const uint8_t* shuffler,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {q2}, [%3] \n" // shuffler
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
|
||||
@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
|
||||
"vld1.8 {d1}, [%1]! \n" // load 8 Us
|
||||
@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uyvy,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
|
||||
"vld1.8 {d0}, [%1]! \n" // load 8 Us
|
||||
@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
|
||||
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_rgb565,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_rgb,
|
||||
uint32_t dither4,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.32 d7, %2 \n" // dither4
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
|
||||
@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
|
||||
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb1555,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
|
||||
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb4444,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d7, #0x0f \n" // bits to clear with
|
||||
// vbic.
|
||||
"1: \n"
|
||||
@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
||||
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_a,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
|
||||
@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON(
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"vld1.8 {d0}, [%4] \n" // load rgbuvconstants
|
||||
"vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
|
||||
@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // src_stride + src_argb
|
||||
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
|
||||
// coefficient
|
||||
@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // src_stride + src_argb
|
||||
"vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
|
||||
// coefficient
|
||||
@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
}
|
||||
|
||||
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
|
||||
@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
||||
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
|
||||
@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
|
||||
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
|
||||
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
|
||||
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
|
||||
@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
||||
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
|
||||
uint16_t* dst_ar64,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n"
|
||||
"vld1.8 {q2}, [%0]! \n"
|
||||
@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
|
||||
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
||||
uint16_t* dst_ab64,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {q4}, [%3] \n" // shuffler
|
||||
|
||||
"1: \n"
|
||||
@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
||||
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
|
||||
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {d8}, [%3] \n" // shuffler
|
||||
|
||||
"1: \n"
|
||||
@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
||||
int dst_width,
|
||||
int source_y_fraction) {
|
||||
int y1_fraction = source_y_fraction;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"cmp %4, #0 \n"
|
||||
"beq 100f \n"
|
||||
"add %2, %1 \n"
|
||||
@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
||||
int y0_fraction = 256 - y1_fraction;
|
||||
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"cmp %4, #0 \n"
|
||||
"beq 100f \n"
|
||||
"cmp %4, #128 \n"
|
||||
@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"subs %3, #8 \n"
|
||||
"blt 89f \n"
|
||||
// Blend 8 pixels.
|
||||
@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
||||
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
||||
|
||||
// Attenuate 8 pixels.
|
||||
@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
|
||||
int interval_size,
|
||||
int interval_offset,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.u16 q8, %2 \n"
|
||||
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
||||
"vdup.u16 q9, %3 \n" // interval multiply.
|
||||
@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width,
|
||||
uint32_t value) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
||||
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
||||
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
||||
@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
||||
// Similar to ARGBToYJ but stores ARGB.
|
||||
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
|
||||
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
||||
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
||||
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
||||
@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||
// g = (r * 45 + g * 88 + b * 22) >> 7
|
||||
// r = (r * 50 + g * 98 + b * 24) >> 7
|
||||
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d20, #17 \n" // BB coefficient
|
||||
"vmov.u8 d21, #68 \n" // BG coefficient
|
||||
"vmov.u8 d22, #35 \n" // BR coefficient
|
||||
@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
const int8_t* matrix_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
||||
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
||||
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
||||
@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
|
||||
const uint8_t* src_sobely,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
|
||||
const uint8_t* src_sobely,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// 16 pixel loop.
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
||||
@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
|
||||
const uint8_t* src_sobely,
|
||||
uint8_t* dst_argb,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d3, #255 \n" // alpha
|
||||
// 8 pixel loop.
|
||||
"1: \n"
|
||||
@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
||||
const uint8_t* src_y2,
|
||||
uint8_t* dst_sobelx,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%5 \n" // top
|
||||
"vld1.8 {d1}, [%0],%6 \n"
|
||||
@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
||||
const uint8_t* src_y1,
|
||||
uint8_t* dst_sobely,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0],%4 \n" // left
|
||||
"vld1.8 {d1}, [%1],%4 \n"
|
||||
@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst,
float scale,
int width) {
asm volatile (
asm volatile(

"1: \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
@ -3564,11 +3564,11 @@ void HalfFloatRow_NEON(const uint16_t* src,
"vqshrn.u32 d1, q9, #13 \n"
"vqshrn.u32 d2, q10, #13 \n"
"vqshrn.u32 d3, q11, #13 \n"
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale * 1.9259299444e-34f) // %3
: "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
}
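Note on the HalfFloatRow_NEON hunk above (annotation, not part of the patch): the %3 operand is scale multiplied by 1.9259299444e-34f, i.e. scale * 2^-112. Scaling by 2^-112 rebiases the single-precision exponent (bias 127) down to the half-precision bias (15), so the saturating 13-bit narrow (vqshrn.u32 #13) leaves the IEEE binary16 bit pattern. A minimal scalar sketch of the same trick, assuming non-negative, in-range inputs; ToHalfApprox is a hypothetical helper, not a libyuv function:

#include <stdint.h>
#include <string.h>

// value = v * scale, returned as IEEE binary16 bits.
static uint16_t ToHalfApprox(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));                   // bit-cast float to u32
  return (uint16_t)(bits >> 13);  // low 16 bits now hold the half-float pattern
}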
@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
||||
@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0,
|
||||
const uint16_t* src4,
|
||||
uint32_t* dst,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 d6, #4 \n" // constant 4
|
||||
"vmov.u16 d7, #6 \n" // constant 6
|
||||
|
||||
@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
||||
const uint32_t* src1 = src + 1;
|
||||
const uint32_t* src2 = src + 2;
|
||||
const uint32_t* src3 = src + 3;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u32 q10, #4 \n" // constant 4
|
||||
"vmov.u32 q11, #6 \n" // constant 6
|
||||
|
||||
@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
const uint8_t* src_vu,
|
||||
uint8_t* dst_yuv24,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
||||
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
||||
@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
int src_stride_ayuv,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||
@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
int src_stride_ayuv,
|
||||
uint8_t* dst_vu,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||
@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
// Copy row of AYUV Y's into Y.
|
||||
// Similar to ARGBExtractAlphaRow_NEON
|
||||
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
||||
@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
|
||||
// Convert UV plane of NV12 to VU of NV21.
|
||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||
"vld2.8 {d1, d3}, [%0]! \n"
|
||||
@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
int width) {
|
||||
const uint8_t* src_u_1 = src_u + src_stride_u;
|
||||
const uint8_t* src_v_1 = src_v + src_stride_v;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
||||
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
||||
@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = depth - 16; // Negative for right shift.
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.16 q2, %4 \n"
|
||||
"1: \n"
|
||||
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
||||
@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
||||
int depth,
|
||||
int width) {
|
||||
int shift = 16 - depth;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.16 q2, %4 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
||||
@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
||||
uint16_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.16 q2, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
uint16_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vdup.16 d8, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q2, q3}, [%0]! \n"
|
||||
@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
int scale,
int width) {
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
asm volatile (
asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"

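Side note on Convert16To8Row_NEON above (annotation, not from the patch): 15 - __builtin_clz(scale) is log2(scale) - 16, so for the power-of-two scales this path expects, multiplying by scale and keeping the top 16 bits collapses to a single shift, and a negative count becomes a right shift. A hedged scalar sketch; Convert16To8Scalar is a hypothetical name:

#include <stdint.h>

static uint8_t Convert16To8Scalar(uint16_t v, int scale) {
  // e.g. scale = 16384 for 10-bit input: clz = 17, shift = -2, result = v >> 2.
  int shift = 15 - __builtin_clz((uint32_t)scale);
  uint32_t r = shift >= 0 ? (uint32_t)v << shift : (uint32_t)v >> -shift;
  return (uint8_t)(r > 255 ? 255 : r);  // clamp to 8 bits
}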
File diff suppressed because it is too large
@ -47,7 +47,7 @@ extern "C" {
// register) is set to round-to-nearest-up mode(0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
{ \
asm volatile ("csrwi vxrm, 0"); \
asm volatile("csrwi vxrm, 0"); \
ub = yuvconst->kUVCoeff[0]; \
vr = yuvconst->kUVCoeff[1]; \
ug = yuvconst->kUVCoeff[2]; \
@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) sets to round-to-nearest-up mode(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
if (is_yb_positive) {
v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
} else {
@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
}
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up(0).
asm volatile ("csrwi vxrm, 0");
asm volatile("csrwi vxrm, 0");
// Blend 50 / 50.
if (y1_fraction == 128) {
do {

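Context for the vxrm writes above (annotation, not part of the patch): RVV fixed-point rounding mode 0 is round-to-nearest-up, so averaging instructions such as vaaddu produce (a + b + 1) >> 1, the same rounding as NEON vrhadd and SSE pavgb. A one-line scalar equivalent, as a sketch:

#include <stdint.h>

// (a + b + 1) >> 1: halving add with round-to-nearest-up.
static inline uint8_t AverageRoundUp(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}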
@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
const int16_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
asm volatile (
asm volatile(
"ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"

@ -10,8 +10,8 @@

#include "libyuv/scale.h"

#include <limits.h>
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
if (!src_y || !src_u || !src_v || !dst_argb ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
dst_width <= 0 || dst_height <= 0 ||
clip_width <= 0 || clip_height <= 0) {
if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
return -1;
}
const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);

r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb,
dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
clip_width, clip_height, filtering);
r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height,
dst_argb, dst_stride_argb, dst_width, dst_height, clip_x,
clip_y, clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}

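One remark on the re-wrapped validation in YUVToARGBScaleClip above (annotation, not from the patch): the src_width > INT_MAX / 4 bound appears intended to keep the intermediate ARGB stride, src_width * 4, representable as an int, while the buffer size itself is computed in uint64_t. A quick check of the arithmetic:

#include <assert.h>
#include <limits.h>

int main(void) {
  const int max_width = INT_MAX / 4;                 // 536870911
  assert((long long)max_width * 4 <= INT_MAX);       // 2147483644 fits in int
  assert((long long)(max_width + 1) * 4 > INT_MAX);  // one more would overflow
  return 0;
}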
@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// 16 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"psrlw $0xf,%%xmm4 \n"
|
||||
"packuswb %%xmm4,%%xmm4 \n"
|
||||
@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"psrlw $0xf,%%xmm4 \n"
|
||||
"packuswb %%xmm4,%%xmm4 \n"
|
||||
@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
||||
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
||||
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrld $0x18,%%xmm5 \n"
|
||||
"pslld $0x10,%%xmm5 \n"
|
||||
@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
intptr_t stridex3;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"psrlw $0xf,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
|
||||
"vpslld $0x10,%%ymm5,%%ymm5 \n"
|
||||
@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
||||
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
|
||||
@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm3 \n"
|
||||
"movdqa %1,%%xmm4 \n"
|
||||
"movdqa %2,%%xmm5 \n"
|
||||
@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kShuf1), // %1
|
||||
"m"(kShuf2) // %2
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm2 \n"
|
||||
@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm2 \n" // kShuf01
|
||||
"movdqa %1,%%xmm3 \n" // kShuf11
|
||||
"movdqa %2,%%xmm4 \n" // kShuf21
|
||||
@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kShuf11), // %1
|
||||
"m"(kShuf21) // %2
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm5 \n" // kMadd01
|
||||
"movdqa %1,%%xmm0 \n" // kMadd11
|
||||
"movdqa %2,%%xmm1 \n" // kRound34
|
||||
@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kMadd11), // %1
|
||||
"m"(kRound34) // %2
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
||||
@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm2 \n" // kShuf01
|
||||
"movdqa %1,%%xmm3 \n" // kShuf11
|
||||
"movdqa %2,%%xmm4 \n" // kShuf21
|
||||
@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kShuf11), // %1
|
||||
"m"(kShuf21) // %2
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm5 \n" // kMadd01
|
||||
"movdqa %1,%%xmm0 \n" // kMadd11
|
||||
"movdqa %2,%%xmm1 \n" // kRound34
|
||||
@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kRound34) // %2
|
||||
);
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm6 \n"
|
||||
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
|
||||
@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
|
||||
@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm2 \n"
|
||||
"movdqa %1,%%xmm3 \n"
|
||||
"movdqa %2,%%xmm4 \n"
|
||||
@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kShufAb2), // %2
|
||||
"m"(kScaleAb2) // %3
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
|
||||
@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm2 \n"
|
||||
"movdqa %1,%%xmm3 \n"
|
||||
"movdqa %2,%%xmm4 \n"
|
||||
@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"m"(kShufAc3), // %1
|
||||
"m"(kScaleAc33) // %2
|
||||
);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
|
||||
@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
|
||||
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
// above line
|
||||
@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm5 \n"
|
||||
"pcmpeqw %%xmm4,%%xmm4 \n"
|
||||
"psrlw $15,%%xmm4 \n"
|
||||
@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqw %%xmm7,%%xmm7 \n"
|
||||
"psrlw $15,%%xmm7 \n"
|
||||
"psllw $3,%%xmm7 \n" // all 8
|
||||
@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
"pcmpeqd %%xmm4,%%xmm4 \n"
|
||||
"psrld $31,%%xmm4 \n"
|
||||
@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pxor %%xmm7,%%xmm7 \n"
|
||||
"pcmpeqd %%xmm6,%%xmm6 \n"
|
||||
"psrld $31,%%xmm6 \n"
|
||||
@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqw %%xmm4,%%xmm4 \n"
|
||||
"psrlw $15,%%xmm4 \n"
|
||||
"psllw $1,%%xmm4 \n" // all 2
|
||||
@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
"psllw $3,%%xmm6 \n" // all 8
|
||||
@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $15,%%ymm4,%%ymm4 \n"
|
||||
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
|
||||
@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsrlw $15,%%ymm6,%%ymm6 \n"
|
||||
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
|
||||
@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm5 \n"
|
||||
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $15,%%ymm4,%%ymm4 \n"
|
||||
@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vbroadcastf128 %5,%%ymm5 \n"
|
||||
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $15,%%ymm4,%%ymm4 \n"
|
||||
@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
|
||||
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrld $31,%%ymm4,%%ymm4 \n"
|
||||
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
|
||||
@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsrld $31,%%ymm6,%%ymm6 \n"
|
||||
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
|
||||
@ -1761,11 +1761,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
|
||||
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile (
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
asm volatile("pxor %%xmm5,%%xmm5 \n"
|
||||
|
||||
// 16 pixel loop.
|
||||
LABELALIGN
|
||||
// 16 pixel loop.
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm3 \n"
|
||||
"lea 0x10(%0),%0 \n" // src_ptr += 16
|
||||
@ -1781,11 +1780,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALEADDROW_AVX2
|
||||
@ -1793,10 +1792,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
|
||||
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile (
|
||||
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
|
||||
LABELALIGN
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm3 \n"
|
||||
"lea 0x20(%0),%0 \n" // src_ptr += 32
|
||||
@ -1811,11 +1809,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
|
||||
}
|
||||
#endif // HAS_SCALEADDROW_AVX2
|
||||
|
||||
@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
|
||||
int x,
|
||||
int dx) {
|
||||
intptr_t x0, x1, temp_pixel;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movd %6,%%xmm2 \n"
|
||||
"movd %7,%%xmm3 \n"
|
||||
"movl $0x04040000,%k2 \n"
|
||||
@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
|
||||
int dx) {
|
||||
(void)x;
|
||||
(void)dx;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%1),%%xmm0 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
|
||||
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
|
||||
intptr_t src_stepx_x12;
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"lea 0x00(,%1,4),%1 \n"
|
||||
"lea 0x00(%1,%1,2),%4 \n"
|
||||
|
||||
@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
|
||||
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
|
||||
intptr_t src_stepx_x12;
|
||||
intptr_t row1 = (intptr_t)(src_stride);
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"lea 0x00(,%1,4),%1 \n"
|
||||
"lea 0x00(%1,%1,2),%4 \n"
|
||||
"lea 0x00(%0,%5,1),%5 \n"
|
||||
@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
|
||||
int x,
|
||||
int dx) {
|
||||
intptr_t x0, x1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movd %5,%%xmm2 \n"
|
||||
"movd %6,%%xmm3 \n"
|
||||
"pshufd $0x0,%%xmm2,%%xmm2 \n"
|
||||
@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
|
||||
int dx) {
|
||||
(void)x;
|
||||
(void)dx;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%1),%%xmm0 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
|
||||
int x,
|
||||
int dx) {
|
||||
intptr_t x0, x1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm4 \n"
|
||||
"movdqa %1,%%xmm5 \n"
|
||||
:
|
||||
@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
|
||||
"m"(kShuffleFractions) // %1
|
||||
);
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movd %5,%%xmm2 \n"
|
||||
"movd %6,%%xmm3 \n"
|
||||
"pcmpeqb %%xmm6,%%xmm6 \n"
|
||||
@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
|
||||
|
||||
// Divide num by div and return as 16.16 fixed point result.
|
||||
int FixedDiv_X86(int num, int div) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"cdq \n"
|
||||
"shld $0x10,%%eax,%%edx \n"
|
||||
"shl $0x10,%%eax \n"
|
||||
@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) {
|
||||
|
||||
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
|
||||
int FixedDiv1_X86(int num, int div) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"cdq \n"
|
||||
"shld $0x10,%%eax,%%edx \n"
|
||||
"shl $0x10,%%eax \n"
|
||||
@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
|
||||
"psrlw $0xf,%%xmm4 \n"
|
||||
"packuswb %%xmm4,%%xmm4 \n"
|
||||
@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
|
||||
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
|
||||
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
|
||||
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqw %%xmm4,%%xmm4 \n"
|
||||
"psrlw $15,%%xmm4 \n"
|
||||
"psllw $1,%%xmm4 \n" // all 2
|
||||
@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
"psllw $3,%%xmm6 \n" // all 8
|
||||
@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
||||
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $15,%%ymm4,%%ymm4 \n"
|
||||
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
|
||||
@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsrlw $15,%%ymm6,%%ymm6 \n"
|
||||
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
|
||||
@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
|
||||
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
"pcmpeqd %%xmm4,%%xmm4 \n"
|
||||
"psrld $31,%%xmm4 \n"
|
||||
@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"pxor %%xmm7,%%xmm7 \n"
|
||||
"pcmpeqd %%xmm6,%%xmm6 \n"
|
||||
"psrld $31,%%xmm6 \n"
|
||||
@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
|
||||
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrld $31,%%ymm4,%%ymm4 \n"
|
||||
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
|
||||
@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsrld $31,%%ymm6,%%ymm6 \n"
|
||||
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
|
||||
|
||||
@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// load even pixels into q0, odd into q1
|
||||
"vld2.8 {q0, q1}, [%0]! \n"
|
||||
@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop
|
||||
@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
||||
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load up 16x4
|
||||
"vld1.8 {q1}, [%3]! \n"
|
||||
@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
|
||||
"subs %2, %2, #24 \n"
|
||||
@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #3 \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d24, #3 \n"
|
||||
"add %3, %0 \n"
|
||||
"1: \n"
|
||||
@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.8 {q3}, [%3] \n"
|
||||
"1: \n"
|
||||
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
|
||||
@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.16 {q13}, [%5] \n"
|
||||
"vld1.8 {q14}, [%6] \n"
|
||||
"vld1.8 {q15}, [%7] \n"
|
||||
@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vld1.16 {q13}, [%4] \n"
|
||||
"vld1.8 {q14}, [%5] \n"
|
||||
"add %3, %0 \n"
|
||||
@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_temp = src_ptr + 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d30, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_temp = src_ptr + 1;
|
||||
const uint8_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 q15, #3 \n"
|
||||
"vmov.u8 d28, #3 \n"
|
||||
|
||||
@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 q15, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 q15, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 d31, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 d31, #3 \n"
|
||||
"vmov.u32 q14, #3 \n"
|
||||
|
||||
@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_temp = src_ptr + 2;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u8 d30, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_temp = src_ptr + 2;
|
||||
const uint8_t* src_temp1 = src_ptr1 + 2;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 q15, #3 \n"
|
||||
"vmov.u8 d28, #3 \n"
|
||||
|
||||
@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 2;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 d30, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 2;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 2;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"vmov.u16 d30, #3 \n"
|
||||
"vmov.u32 q14, #3 \n"
|
||||
|
||||
@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
void ScaleAddRow_NEON(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
|
||||
@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
|
||||
@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"mov r12, %3, lsl #2 \n"
|
||||
"1: \n"
|
||||
"vld1.32 {d0[0]}, [%0], r12 \n"
|
||||
@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
|
||||
int src_stepx,
|
||||
uint8_t* dst_argb,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"mov r12, %4, lsl #2 \n"
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
|
||||
int dx) {
|
||||
int tmp;
|
||||
const uint8_t* src_tmp = src_argb;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// clang-format off
|
||||
LOAD1_DATA32_LANE(d0, 0)
|
||||
@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
|
||||
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
|
||||
@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
|
||||
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
|
||||
@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
|
||||
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.16 {d0[0]}, [%0], %6 \n"
|
||||
"vld1.16 {d0[1]}, [%1], %6 \n"
|
||||
|
||||
@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// load even pixels into v0, odd into v1
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
// load even pixels into v0, odd into v1
|
||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -172,18 +172,18 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"ld1 {v29.16b}, [%[kShuf34_0]] \n"
|
||||
"ld1 {v30.16b}, [%[kShuf34_1]] \n"
|
||||
"ld1 {v31.16b}, [%[kShuf34_2]] \n"
|
||||
"1: \n"
|
||||
"ld1 {v29.16b}, [%[kShuf34_0]] \n"
|
||||
"ld1 {v30.16b}, [%[kShuf34_1]] \n"
|
||||
"ld1 {v31.16b}, [%[kShuf34_2]] \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n"
|
||||
"subs %w[width], %w[width], #48 \n"
|
||||
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n"
|
||||
"tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n"
|
||||
"tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n"
|
||||
"st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
|
||||
"b.gt 1b \n"
|
||||
"subs %w[width], %w[width], #48 \n"
|
||||
"tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n"
|
||||
"tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n"
|
||||
"tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n"
|
||||
"st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
|
||||
[width] "+r"(dst_width) // %[width]
|
||||
@ -326,7 +326,7 @@ static const vec16 kMult38_Div664 = {
|
||||
65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0};
|
||||
static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12,
|
||||
65536 / 18, 65536 / 18, 65536 / 12,
|
||||
0, 0};
|
||||
0, 0};
|
||||
|
||||
// 32 -> 12
|
||||
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
|
||||
@ -335,26 +335,26 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"ld1 {v3.16b}, [%[kShuf38]] \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"b.eq 2f \n"
|
||||
"ld1 {v3.16b}, [%[kShuf38]] \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"b.eq 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src_ptr]], #32 \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
|
||||
"str q2, [%[dst_ptr]] \n"
|
||||
"add %[dst_ptr], %[dst_ptr], #12 \n"
|
||||
"b.gt 1b \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src_ptr]], #32 \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead
|
||||
"str q2, [%[dst_ptr]] \n"
|
||||
"add %[dst_ptr], %[dst_ptr], #12 \n"
|
||||
"b.gt 1b \n"
|
||||
|
||||
// Store exactly 12 bytes on the final iteration to avoid writing past
|
||||
// the end of the array.
|
||||
"2: \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
|
||||
"st1 {v2.s}[2], [%[dst_ptr]] \n"
|
||||
"2: \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n"
|
||||
"st1 {v2.8b}, [%[dst_ptr]], #8 \n"
|
||||
"st1 {v2.s}[2], [%[dst_ptr]] \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
|
||||
[width] "+r"(dst_width) // %[width]
|
||||
@ -378,49 +378,49 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
||||
asm volatile(
|
||||
"ld1 {v27.16b}, [%[tblArray1]] \n"
|
||||
"ld1 {v28.16b}, [%[tblArray2]] \n"
|
||||
"ld1 {v29.16b}, [%[tblArray3]] \n"
|
||||
"ld1 {v31.16b}, [%[tblArray4]] \n"
|
||||
"ld1 {v30.16b}, [%[div996]] \n"
|
||||
"ld1 {v27.16b}, [%[tblArray1]] \n"
|
||||
"ld1 {v28.16b}, [%[tblArray2]] \n"
|
||||
"ld1 {v29.16b}, [%[tblArray3]] \n"
|
||||
"ld1 {v31.16b}, [%[tblArray4]] \n"
|
||||
"ld1 {v30.16b}, [%[div996]] \n"
|
||||
|
||||
"1: \n"
|
||||
"ldp q20, q0, [%[src_ptr]], #32 \n"
|
||||
"ldp q21, q1, [%[src_ptr1]], #32 \n"
|
||||
"ldp q22, q2, [%[src_ptr2]], #32 \n"
|
||||
"1: \n"
|
||||
"ldp q20, q0, [%[src_ptr]], #32 \n"
|
||||
"ldp q21, q1, [%[src_ptr1]], #32 \n"
|
||||
"ldp q22, q2, [%[src_ptr2]], #32 \n"
|
||||
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
|
||||
// Add across strided rows first.
|
||||
"uaddl v23.8h, v20.8b, v21.8b \n"
|
||||
"uaddl v3.8h, v0.8b, v1.8b \n"
|
||||
"uaddl2 v24.8h, v20.16b, v21.16b \n"
|
||||
"uaddl2 v4.8h, v0.16b, v1.16b \n"
|
||||
"uaddl v23.8h, v20.8b, v21.8b \n"
|
||||
"uaddl v3.8h, v0.8b, v1.8b \n"
|
||||
"uaddl2 v24.8h, v20.16b, v21.16b \n"
|
||||
"uaddl2 v4.8h, v0.16b, v1.16b \n"
|
||||
|
||||
"uaddw v23.8h, v23.8h, v22.8b \n"
|
||||
"uaddw v3.8h, v3.8h, v2.8b \n"
|
||||
"uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ...
|
||||
"uaddw2 v4.8h, v4.8h, v2.16b \n"
|
||||
"uaddw v23.8h, v23.8h, v22.8b \n"
|
||||
"uaddw v3.8h, v3.8h, v2.8b \n"
|
||||
"uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ...
|
||||
"uaddw2 v4.8h, v4.8h, v2.16b \n"
|
||||
|
||||
// Permute groups of {three,three,two} into separate vectors to sum.
|
||||
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ...
|
||||
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
|
||||
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ...
|
||||
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
|
||||
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0...
|
||||
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
|
||||
"tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ...
|
||||
"tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n"
|
||||
"tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ...
|
||||
"tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n"
|
||||
"tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0...
|
||||
"tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n"
|
||||
|
||||
"add v23.8h, v20.8h, v21.8h \n"
|
||||
"add v3.8h, v0.8h, v1.8h \n"
|
||||
"add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h
|
||||
"add v4.8h, v3.8h, v2.8h \n"
|
||||
"add v23.8h, v20.8h, v21.8h \n"
|
||||
"add v3.8h, v0.8h, v1.8h \n"
|
||||
"add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h
|
||||
"add v4.8h, v3.8h, v2.8h \n"
|
||||
|
||||
"sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6}
|
||||
"sqrdmulh v25.8h, v4.8h, v30.8h \n"
|
||||
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow.
|
||||
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
|
||||
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
"sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6}
|
||||
"sqrdmulh v25.8h, v4.8h, v30.8h \n"
|
||||
"tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow.
|
||||
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
|
||||
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
|
||||
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
|
||||
@ -446,41 +446,41 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||
asm volatile(
|
||||
"ld1 {v28.16b}, [%[tblArray1]] \n"
|
||||
"ld1 {v29.16b}, [%[tblArray2]] \n"
|
||||
"ld1 {v31.16b}, [%[tblArray3]] \n"
|
||||
"ld1 {v30.8h}, [%[div664]] \n"
|
||||
"ld1 {v28.16b}, [%[tblArray1]] \n"
|
||||
"ld1 {v29.16b}, [%[tblArray2]] \n"
|
||||
"ld1 {v31.16b}, [%[tblArray3]] \n"
|
||||
"ld1 {v30.8h}, [%[div664]] \n"
|
||||
|
||||
"1: \n"
|
||||
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
|
||||
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
"1: \n"
|
||||
"ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ...
|
||||
"ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ...
|
||||
"subs %w[width], %w[width], #12 \n"
|
||||
|
||||
// Permute into groups of six values (three pairs) to be summed.
|
||||
"tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ...
|
||||
"tbl v2.16b, {v0.16b}, v28.16b \n"
|
||||
"tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ...
|
||||
"tbl v3.16b, {v1.16b}, v28.16b \n"
|
||||
"tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ...
|
||||
"tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n"
|
||||
"tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ...
|
||||
"tbl v2.16b, {v0.16b}, v28.16b \n"
|
||||
"tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ...
|
||||
"tbl v3.16b, {v1.16b}, v28.16b \n"
|
||||
"tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ...
|
||||
"tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n"
|
||||
|
||||
"uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ...
|
||||
"uaddlp v2.8h, v2.16b \n"
|
||||
"uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ...
|
||||
"uaddlp v3.8h, v3.16b \n"
|
||||
"uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ...
|
||||
"uaddlp v4.8h, v4.16b \n"
|
||||
"add v20.8h, v22.8h, v23.8h \n"
|
||||
"add v0.8h, v2.8h, v3.8h \n"
|
||||
"add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ...
|
||||
"add v1.8h, v0.8h, v4.8h \n"
|
||||
"uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ...
|
||||
"uaddlp v2.8h, v2.16b \n"
|
||||
"uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ...
|
||||
"uaddlp v3.8h, v3.16b \n"
|
||||
"uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ...
|
||||
"uaddlp v4.8h, v4.16b \n"
|
||||
"add v20.8h, v22.8h, v23.8h \n"
|
||||
"add v0.8h, v2.8h, v3.8h \n"
|
||||
"add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ...
|
||||
"add v1.8h, v0.8h, v4.8h \n"
|
||||
|
||||
"sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4}
|
||||
"sqrdmulh v22.8h, v1.8h, v30.8h \n"
|
||||
"tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow.
|
||||
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
|
||||
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
"sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4}
|
||||
"sqrdmulh v22.8h, v1.8h, v30.8h \n"
|
||||
"tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow.
|
||||
"st1 {v21.d}[0], [%[dst_ptr]], #8 \n"
|
||||
"st1 {v21.s}[2], [%[dst_ptr]], #4 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst_ptr), // %[dst_ptr]
|
||||
[src_ptr1] "+r"(src_ptr1), // %[src_ptr1]
|
||||
@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_temp = src_ptr + 1;
|
||||
const uint8_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8b, #3 \n"
|
||||
"movi v30.8h, #3 \n"
|
||||
|
||||
@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8h, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8h, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8h, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 1;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 1;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.4h, #3 \n"
|
||||
"movi v30.4s, #3 \n"
|
||||
|
||||
@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_temp = src_ptr + 2;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8b, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src_temp = src_ptr + 2;
|
||||
const uint8_t* src_temp1 = src_ptr1 + 2;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8b, #3 \n"
|
||||
"movi v30.8h, #3 \n"
|
||||
|
||||
@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint16_t* src_temp = src_ptr + 2;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.8h, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
const uint16_t* src_temp = src_ptr + 2;
|
||||
const uint16_t* src_temp1 = src_ptr1 + 2;
|
||||
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"movi v31.4h, #3 \n"
|
||||
"movi v30.4s, #3 \n"
|
||||
|
||||
@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
|
||||
void ScaleAddRow_NEON(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
|
||||
@ -1043,14 +1043,14 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
|
||||
"trn1 v21.8h, v2.8h, v0.8h \n"
|
||||
|
||||
"1: \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[7], [%[tmp_ptr]] \n"
|
||||
"ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v6.h}[7], [%[tmp_ptr]] \n"
|
||||
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop
|
||||
"trn1 v4.16b, v6.16b, v0.16b \n"
|
||||
@ -1090,14 +1090,14 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"uzp2 v0.4s, v0.4s, v1.4s \n"
|
||||
"uzp2 v1.4s, v2.4s, v3.4s \n"
|
||||
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"uzp2 v0.4s, v0.4s, v1.4s \n"
|
||||
"uzp2 v1.4s, v2.4s, v3.4s \n"
|
||||
"st1 {v0.4s, v1.4s}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_ptr), // %[src]
|
||||
[dst] "+r"(dst), // %[dst]
|
||||
[width] "+r"(dst_width) // %[width]
|
||||
@ -1113,15 +1113,15 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1 = src_argb + 32;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]] \n"
|
||||
"add %[src], %[src], #64 \n"
|
||||
"ld2 {v2.4s, v3.4s}, [%[src1]] \n"
|
||||
"add %[src1], %[src1], #64 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v1.16b, v2.16b, v3.16b \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"st1 {v0.16b, v1.16b}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]] \n"
|
||||
"add %[src], %[src], #64 \n"
|
||||
"ld2 {v2.4s, v3.4s}, [%[src1]] \n"
|
||||
"add %[src1], %[src1], #64 \n"
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n"
|
||||
"urhadd v1.16b, v2.16b, v3.16b \n"
|
||||
"subs %w[width], %w[width], #8 \n"
|
||||
"st1 {v0.16b, v1.16b}, [%[dst]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_argb), // %[src]
|
||||
[src1] "+r"(src_argb1), // %[src1]
|
||||
[dst] "+r"(dst_argb), // %[dst]
|
||||
@ -1135,21 +1135,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
|
||||
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
|
||||
"uaddl v2.8h, v0.8b, v1.8b \n"
|
||||
"uaddl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"uaddl v22.8h, v20.8b, v21.8b \n"
|
||||
"uaddl2 v23.8h, v20.16b, v21.16b \n"
|
||||
"add v0.8h, v2.8h, v22.8h \n"
|
||||
"add v1.8h, v3.8h, v23.8h \n"
|
||||
"rshrn v0.8b, v0.8h, #2 \n"
|
||||
"rshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w[width], %w[width], #4 \n"
|
||||
"stp d0, d1, [%[dst]], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
|
||||
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
|
||||
"uaddl v2.8h, v0.8b, v1.8b \n"
|
||||
"uaddl2 v3.8h, v0.16b, v1.16b \n"
|
||||
"uaddl v22.8h, v20.8b, v21.8b \n"
|
||||
"uaddl2 v23.8h, v20.16b, v21.16b \n"
|
||||
"add v0.8h, v2.8h, v22.8h \n"
|
||||
"add v1.8h, v3.8h, v23.8h \n"
|
||||
"rshrn v0.8b, v0.8h, #2 \n"
|
||||
"rshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w[width], %w[width], #4 \n"
|
||||
"stp d0, d1, [%[dst]], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst),
|
||||
[width] "+r"(dst_width)
|
||||
:
|
||||
@ -1166,26 +1166,22 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb3 = src_argb + src_stepx * 12;
|
||||
int64_t i = 0;
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"ldr w10, [%[src], %[i]] \n"
|
||||
"ldr w11, [%[src1], %[i]] \n"
|
||||
"ldr w12, [%[src2], %[i]] \n"
|
||||
"ldr w13, [%[src3], %[i]] \n"
|
||||
"add %[i], %[i], %[step] \n"
|
||||
"subs %w[width], %w[width], #4 \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"stp w10, w11, [%[dst]], #8 \n"
|
||||
"stp w12, w13, [%[dst]], #8 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src]"+r"(src_argb),
|
||||
[src1]"+r"(src_argb1),
|
||||
[src2]"+r"(src_argb2),
|
||||
[src3]"+r"(src_argb3),
|
||||
[dst]"+r"(dst_argb),
|
||||
[width]"+r"(dst_width),
|
||||
[i]"+r"(i)
|
||||
: [step]"r"((int64_t)(src_stepx * 16))
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ldr w10, [%[src], %[i]] \n"
|
||||
"ldr w11, [%[src1], %[i]] \n"
|
||||
"ldr w12, [%[src2], %[i]] \n"
|
||||
"ldr w13, [%[src3], %[i]] \n"
|
||||
"add %[i], %[i], %[step] \n"
|
||||
"subs %w[width], %w[width], #4 \n"
|
||||
"prfm pldl1keep, [%[src], 448] \n"
|
||||
"stp w10, w11, [%[dst]], #8 \n"
|
||||
"stp w12, w13, [%[dst]], #8 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2),
|
||||
[src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width),
|
||||
[i] "+r"(i)
|
||||
: [step] "r"((int64_t)(src_stepx * 16))
|
||||
: "memory", "cc", "w10", "w11", "w12", "w13");
|
||||
}
|
||||
|
||||
@ -1312,33 +1308,33 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
|
||||
|
||||
"1: \n" //
|
||||
SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
||||
"ldr d1, [%6] \n" //
|
||||
"ldr d1, [%6] \n" //
|
||||
SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
||||
"ldr d2, [%6] \n"
|
||||
"shrn v4.4h, v5.4s, #9 \n" //
|
||||
"ldr d2, [%6] \n"
|
||||
"shrn v4.4h, v5.4s, #9 \n" //
|
||||
SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v1.d}[1], [%6] \n" //
|
||||
"ld1 {v1.d}[1], [%6] \n" //
|
||||
SCALE_ARGB_FILTER_COLS_STEP_ADDR
|
||||
"ld1 {v2.d}[1], [%6] \n"
|
||||
"ld1 {v2.d}[1], [%6] \n"
|
||||
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"and v4.8b, v4.8b, v3.8b \n"
|
||||
"trn1 v0.4s, v1.4s, v2.4s \n"
|
||||
"tbl v4.16b, {v4.16b}, v18.16b \n" // f
|
||||
"trn2 v1.4s, v1.4s, v2.4s \n"
|
||||
"eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f
|
||||
"subs %w2, %w2, #4 \n" // 4 processed per loop
|
||||
"and v4.8b, v4.8b, v3.8b \n"
|
||||
"trn1 v0.4s, v1.4s, v2.4s \n"
|
||||
"tbl v4.16b, {v4.16b}, v18.16b \n" // f
|
||||
"trn2 v1.4s, v1.4s, v2.4s \n"
|
||||
"eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f
|
||||
|
||||
"umull v16.8h, v1.8b, v4.8b \n"
|
||||
"umull2 v17.8h, v1.16b, v4.16b \n"
|
||||
"umlal v16.8h, v0.8b, v7.8b \n"
|
||||
"umlal2 v17.8h, v0.16b, v7.16b \n"
|
||||
"umull v16.8h, v1.8b, v4.8b \n"
|
||||
"umull2 v17.8h, v1.16b, v4.16b \n"
|
||||
"umlal v16.8h, v0.8b, v7.8b \n"
|
||||
"umlal2 v17.8h, v0.16b, v7.16b \n"
|
||||
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"shrn v0.8b, v16.8h, #7 \n"
|
||||
"shrn v1.8b, v17.8h, #7 \n"
|
||||
"add v5.4s, v5.4s, v6.4s \n"
|
||||
"stp d0, d1, [%0], #16 \n" // store pixels
|
||||
"b.gt 1b \n"
|
||||
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
|
||||
"shrn v0.8b, v16.8h, #7 \n"
|
||||
"shrn v1.8b, v17.8h, #7 \n"
|
||||
"add v5.4s, v5.4s, v6.4s \n"
|
||||
"stp d0, d1, [%0], #16 \n" // store pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(dst_argb), // %0
|
||||
"+r"(src_argb), // %1
|
||||
"+r"(dst_width), // %2
|
||||
@ -1360,34 +1356,34 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"subs %w[dst_width], %w[dst_width], #32 \n"
|
||||
"b.lt 2f \n"
|
||||
"subs %w[dst_width], %w[dst_width], #32 \n"
|
||||
"b.lt 2f \n"
|
||||
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||
"ldp q4, q5, [%[src_ptr], #64] \n"
|
||||
"ldp q6, q7, [%[src_ptr], #96] \n"
|
||||
"add %[src_ptr], %[src_ptr], #128 \n"
|
||||
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||
"uzp2 v2.8h, v4.8h, v5.8h \n"
|
||||
"uzp2 v3.8h, v6.8h, v7.8h \n"
|
||||
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
|
||||
"stp q0, q1, [%[dst_ptr]] \n"
|
||||
"stp q2, q3, [%[dst_ptr], #32] \n"
|
||||
"add %[dst_ptr], %[dst_ptr], #64 \n"
|
||||
"b.ge 1b \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||
"ldp q4, q5, [%[src_ptr], #64] \n"
|
||||
"ldp q6, q7, [%[src_ptr], #96] \n"
|
||||
"add %[src_ptr], %[src_ptr], #128 \n"
|
||||
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||
"uzp2 v2.8h, v4.8h, v5.8h \n"
|
||||
"uzp2 v3.8h, v6.8h, v7.8h \n"
|
||||
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
|
||||
"stp q0, q1, [%[dst_ptr]] \n"
|
||||
"stp q2, q3, [%[dst_ptr], #32] \n"
|
||||
"add %[dst_ptr], %[dst_ptr], #64 \n"
|
||||
"b.ge 1b \n"
|
||||
|
||||
"2: \n"
|
||||
"adds %w[dst_width], %w[dst_width], #32 \n"
|
||||
"b.eq 99f \n"
|
||||
"adds %w[dst_width], %w[dst_width], #32 \n"
|
||||
"b.eq 99f \n"
|
||||
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||
"stp q0, q1, [%[dst_ptr]] \n"
|
||||
"ldp q0, q1, [%[src_ptr]] \n"
|
||||
"ldp q2, q3, [%[src_ptr], #32] \n"
|
||||
"uzp2 v0.8h, v0.8h, v1.8h \n"
|
||||
"uzp2 v1.8h, v2.8h, v3.8h \n"
|
||||
"stp q0, q1, [%[dst_ptr]] \n"
|
||||
|
||||
"99: \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
@ -1403,15 +1399,15 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
|
||||
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
|
||||
"subs %w[dst_width], %w[dst_width], #16 \n"
|
||||
"urhadd v0.8h, v0.8h, v1.8h \n"
|
||||
"urhadd v1.8h, v2.8h, v3.8h \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n"
|
||||
"stp q0, q1, [%[dst_ptr]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n"
|
||||
"ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n"
|
||||
"subs %w[dst_width], %w[dst_width], #16 \n"
|
||||
"urhadd v0.8h, v0.8h, v1.8h \n"
|
||||
"urhadd v1.8h, v2.8h, v3.8h \n"
|
||||
"prfm pldl1keep, [%[src_ptr], 448] \n"
|
||||
"stp q0, q1, [%[dst_ptr]], #32 \n"
|
||||
"b.gt 1b \n"
|
||||
: [src_ptr] "+r"(src_ptr), // %[src_ptr]
|
||||
[dst_ptr] "+r"(dst), // %[dst_ptr]
|
||||
[dst_width] "+r"(dst_width) // %[dst_width]
|
||||
@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
|
||||
"1: \n"
|
||||
@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
// change the stride to row 2 pointer
|
||||
"add %1, %1, %0 \n"
|
||||
"1: \n"
|
||||
@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
|
||||
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
|
||||
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
|
||||
(void)src_stride;
|
||||
asm volatile (
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.h}[0], [%0], %6 \n"
|
||||
"ld1 {v1.h}[0], [%1], %6 \n"
|
||||
|
||||
@ -10,8 +10,8 @@

#include "libyuv/scale.h" /* For FilterMode */

#include <limits.h>
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb,
int dst_height,
enum FilterMode filtering) {
int r;
if (!src_rgb || !dst_rgb ||
src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) {
if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
dst_height <= 0) {
return -1;
}
const int abs_src_height = (src_height < 0) ? -src_height : src_height;

@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
|
||||
const uint32_t* src = (const uint32_t*)(src_argb);
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m4_t v_odd, v_even, v_dst;
|
||||
vuint32m4_t v_odd_32, v_even_32;
|
||||
@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
|
||||
const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
|
||||
vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
|
||||
@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
|
||||
const int stride_byte = src_stepx * 4;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
|
||||
vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
|
||||
@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
|
||||
(void)src_stride;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m4_t v_s0, v_s1, v_dst;
|
||||
size_t vl = __riscv_vsetvl_e8m4(w);
|
||||
@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
|
||||
size_t w = (size_t)dst_width;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
size_t vl = __riscv_vsetvl_e8m4(w);
|
||||
vuint8m4_t v_s0, v_s1, v_t0, v_t1;
|
||||
@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
|
||||
size_t w = (size_t)dst_width;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
|
||||
vuint8m2_t v_t0, v_t1, v_t2, v_t3;
|
||||
@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
|
||||
const uint8_t* t = src_ptr + src_stride;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
|
||||
vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
|
||||
@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
|
||||
const uint8_t* t = src_ptr + src_stride;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m2_t v_s0, v_s1, v_s2, v_s3;
|
||||
vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
|
||||
@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
|
||||
(void)src_stride;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m4_t v_u0v0, v_u1v1, v_avg;
|
||||
vuint16m4_t v_u0v0_16, v_u1v1_16;
|
||||
@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
|
||||
size_t w = (size_t)dst_width;
|
||||
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
|
||||
// register) is set to round-to-nearest-up mode(0).
|
||||
asm volatile ("csrwi vxrm, 0");
|
||||
asm volatile("csrwi vxrm, 0");
|
||||
do {
|
||||
vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
|
||||
vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
|
||||
|
||||
@ -15,7 +15,6 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
|
||||
defined(__aarch64__)
|
||||
|
||||
|
||||
@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width,
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV) && !filtering) {
|
||||
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
|
||||
ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN4_RVV)
|
||||
if (col_step == 4) {
|
||||
ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
|
||||
ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN4_RVV)
|
||||
if (col_step == 4) {
|
||||
ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@
#include <stdlib.h>
#include <time.h>

#include "../unit_test/unit_test.h"
#include "libyuv/basic_types.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@ -19,7 +20,6 @@
#include "libyuv/convert_from.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "../unit_test/unit_test.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"

@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
#endif

#ifdef __linux__
static void KernelVersion(int *version) {
static void KernelVersion(int* version) {
struct utsname buffer;
int i = 0;

version[0] = version[1] = 0;
if (uname(&buffer) == 0) {
char *v = buffer.release;
char* v = buffer.release;
for (i = 0; *v && i < 2; ++v) {
if (isdigit(*v)) {
version[i++] = (int) strtol(v, &v, 10);
version[i++] = (int)strtol(v, &v, 10);
}
}
}
@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {

// Read and print the RVV vector length.
if (has_rvv) {
register uint32_t vlenb __asm__ ("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
register uint32_t vlenb __asm__("t0");
__asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
printf("RVV vector length: %d bytes\n", vlenb);
}
}
@ -161,7 +161,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
#if defined(__loongarch__)
int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH);
if (has_loongarch) {
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lsx = TestCpuFlag(kCpuHasLSX);
int has_lasx = TestCpuFlag(kCpuHasLASX);
printf("Has LOONGARCH 0x%x\n", has_loongarch);
printf("Has LSX 0x%x\n", has_lsx);
@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
}
#endif  // defined(__loongarch__)

#if defined(__i386__) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_X64)
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
defined(_M_X64)
int has_x86 = TestCpuFlag(kCpuHasX86);
if (has_x86) {
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
printf("Has AMXINT8 0x%x\n", has_amxint8);
}
#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
// defined(_M_X64)
}

TEST_F(LibYUVBaseTest, TestCompilerMacros) {

@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width,
|
||||
// Disable all optimizations.
|
||||
MaskCpuFlags(disable_cpu_flags);
|
||||
for (int i = 0; i < benchmark_iterations; i++) {
|
||||
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert);
|
||||
CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width,
|
||||
benchmark_width, benchmark_height * invert);
|
||||
}
|
||||
|
||||
// Enable optimizations.
|
||||
MaskCpuFlags(benchmark_cpu_info);
|
||||
for (int i = 0; i < benchmark_iterations; i++) {
|
||||
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert);
|
||||
CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width,
|
||||
benchmark_width, benchmark_height * invert);
|
||||
}
|
||||
|
||||
int max_diff = 0;
|
||||
for (int i = 0; i < y_plane_size; ++i) {
|
||||
int abs_diff = abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
|
||||
int abs_diff =
|
||||
abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
|
||||
if (abs_diff > max_diff) {
|
||||
max_diff = abs_diff;
|
||||
}
|
||||
@ -1596,29 +1599,29 @@ static int TestCopyPlane(int benchmark_width,
|
||||
|
||||
TEST_F(LibYUVPlanarTest, CopyPlane_Any) {
|
||||
int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, +1, 0);
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, +1, 0);
|
||||
EXPECT_LE(max_diff, 0);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) {
|
||||
int max_diff =
|
||||
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
|
||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
|
||||
EXPECT_LE(max_diff, 0);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, CopyPlane_Invert) {
|
||||
int max_diff =
|
||||
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
|
||||
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
|
||||
EXPECT_LE(max_diff, 0);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, CopyPlane_Opt) {
|
||||
int max_diff =
|
||||
TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
|
||||
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
|
||||
EXPECT_LE(max_diff, 0);
|
||||
}
|
||||
|
||||
@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width,
|
||||
// Disable all optimizations.
|
||||
MaskCpuFlags(disable_cpu_flags);
|
||||
for (j = 0; j < benchmark_iterations; j++) {
|
||||
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
|
||||
reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
|
||||
scale, benchmark_width, benchmark_height * invert);
|
||||
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
|
||||
benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_c),
|
||||
benchmark_width * 2, scale, benchmark_width,
|
||||
benchmark_height * invert);
|
||||
}
|
||||
|
||||
// Enable optimizations.
|
||||
MaskCpuFlags(benchmark_cpu_info);
|
||||
for (j = 0; j < benchmark_iterations; j++) {
|
||||
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
|
||||
reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
|
||||
scale, benchmark_width, benchmark_height * invert);
|
||||
HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
|
||||
benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_opt),
|
||||
benchmark_width * 2, scale, benchmark_width,
|
||||
benchmark_height * invert);
|
||||
}
|
||||
|
||||
int max_diff = 0;
|
||||
@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
#if defined(__arm__)
|
||||
static void EnableFlushDenormalToZero(void) {
|
||||
uint32_t cw;
|
||||
asm volatile (
|
||||
"vmrs %0, fpscr \n"
|
||||
"orr %0, %0, #0x1000000 \n"
|
||||
"vmsr fpscr, %0 \n"
|
||||
: "=r"(cw)
|
||||
::"memory", "cc"); // Clobber List
|
||||
asm volatile(
|
||||
"vmrs %0, fpscr \n"
|
||||
"orr %0, %0, #0x1000000 \n"
|
||||
"vmsr fpscr, %0 \n"
|
||||
: "=r"(cw)::"memory", "cc"); // Clobber List
|
||||
}
|
||||
|
||||
static void DisableFlushDenormalToZero(void) {
|
||||
uint32_t cw;
|
||||
asm volatile (
|
||||
"vmrs %0, fpscr \n"
|
||||
"bic %0, %0, #0x1000000 \n"
|
||||
"vmsr fpscr, %0 \n"
|
||||
: "=r"(cw)
|
||||
::"memory", "cc"); // Clobber List
|
||||
asm volatile(
|
||||
"vmrs %0, fpscr \n"
|
||||
"bic %0, %0, #0x1000000 \n"
|
||||
"vmsr fpscr, %0 \n"
|
||||
: "=r"(cw)::"memory", "cc"); // Clobber List
|
||||
}
|
||||
|
||||
// 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
|
||||
@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) {
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
|
||||
// 32 bit arm rounding on denormal case is off by 1 compared to C.
|
||||
EnableFlushDenormalToZero();
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
|
||||
DisableFlushDenormalToZero();
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
|
||||
EnableFlushDenormalToZero();
|
||||
int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, disable_cpu_flags_,
|
||||
benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
|
||||
int diff = TestHalfFloatPlane(
|
||||
benchmark_width_, benchmark_height_, benchmark_iterations_,
|
||||
disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
|
||||
DisableFlushDenormalToZero();
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
|
||||
tmp_pixels_c_b, benchmark_width_, benchmark_width_,
|
||||
benchmark_height_);
|
||||
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
|
||||
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
|
||||
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
|
||||
benchmark_width_, tmp_pixels_c_b, benchmark_width_,
|
||||
dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
|
||||
benchmark_height_);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
|
||||
SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r,
|
||||
@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
|
||||
tmp_pixels_c_b, benchmark_width_, benchmark_width_,
|
||||
benchmark_height_);
|
||||
MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
|
||||
benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
|
||||
benchmark_width_ * 3, benchmark_width_, benchmark_height_);
|
||||
benchmark_width_, tmp_pixels_c_b, benchmark_width_,
|
||||
dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
|
||||
benchmark_height_);
|
||||
MaskCpuFlags(benchmark_cpu_info_);
|
||||
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
|
||||
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
|
||||
benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0,
|
||||
dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
|
||||
benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL,
|
||||
0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
|
||||
benchmark_height_);
|
||||
}
|
||||
|
||||
@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r,
|
||||
benchmark_width_, tmp_pixels_opt_g, benchmark_width_,
|
||||
tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_,
|
||||
benchmark_height_);
|
||||
tmp_pixels_opt_b, benchmark_width_, NULL, 0,
|
||||
benchmark_width_, benchmark_height_);
|
||||
}
|
||||
|
||||
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
|
||||
|
||||
@ -320,16 +320,16 @@ TEST_FACTOR(3, 1, 3)
|
||||
|
||||
#ifndef DISABLE_SLOW_TESTS
|
||||
// Test scale to a specified size with all 4 filters.
|
||||
#define TEST_SCALETO(name, width, height) \
|
||||
TEST_SCALETO1(, name, width, height, None, 0) \
|
||||
TEST_SCALETO1(, name, width, height, Linear, 3) \
|
||||
#define TEST_SCALETO(name, width, height) \
|
||||
TEST_SCALETO1(, name, width, height, None, 0) \
|
||||
TEST_SCALETO1(, name, width, height, Linear, 3) \
|
||||
TEST_SCALETO1(, name, width, height, Bilinear, 3) \
|
||||
TEST_SCALETO1(, name, width, height, Box, 3)
|
||||
#else
|
||||
#if defined(ENABLE_FULL_TESTS)
|
||||
#define TEST_SCALETO(name, width, height) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
|
||||
#define TEST_SCALETO(name, width, height) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
|
||||
TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
|
||||
#else
|
||||
|
||||
@ -1058,7 +1058,7 @@ TEST_SCALETO(Scale, 320, 240)
|
||||
TEST_SCALETO(Scale, 1280, 720)
|
||||
TEST_SCALETO(Scale, 1920, 1080)
|
||||
TEST_SCALETO(Scale, 1080, 1920) // for rotated phones
|
||||
#endif // DISABLE_SLOW_TESTS
|
||||
#endif // DISABLE_SLOW_TESTS
|
||||
#undef TEST_SCALETO1
|
||||
#undef TEST_SCALETO
|