mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00

Add volatile to gcc inline asm to avoid it being removed

Bug: b/42280943
Change-Id: I4439077a92ffa6dff91d2d10accd5251b76f7544
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5671187
Reviewed-by: David Gao <davidgao@google.com>

This commit is contained in:
parent efd164d64e
commit 616bee5420
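
Background on the change: GCC treats a non-volatile extended asm that has output
operands as a pure function of its inputs, so the whole statement may be deleted if
the outputs appear unused (for example after inlining), and it may be hoisted out of
loops or reordered. This commit therefore marks the large compare/rotate/row kernels
asm volatile, and conversely drops volatile from load/store macros whose "m" operands
already describe every byte they touch. A minimal sketch of the failure mode, with
hypothetical names (not code from this commit):

  #include <stdint.h>

  /* Without "volatile", GCC may remove this asm entirely when "lo" ends up
   * unused, or move it across surrounding memory operations. */
  static inline uint32_t low_word(uint64_t v) {
    uint32_t lo;
    asm volatile("movl %k1, %0" : "=r"(lo) : "r"(v));
    return lo;
  }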
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1888
+Version: 1889
 License: BSD
 License File: LICENSE
 Shipped: yes
@@ -20,9 +20,9 @@
 ({ \
   const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
   uint32_t val_m; \
-  asm volatile("lw %[val_m], %[psrc_lw_m] \n" \
+  asm("lw %[val_m], %[psrc_lw_m] \n" \
       : [val_m] "=r"(val_m) \
       : [psrc_lw_m] "m"(*psrc_lw_m)); \
   val_m; \
 })
 

@@ -31,9 +31,9 @@
 ({ \
   const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
   uint64_t val_m = 0; \
-  asm volatile("ld %[val_m], %[psrc_ld_m] \n" \
+  asm("ld %[val_m], %[psrc_ld_m] \n" \
       : [val_m] "=r"(val_m) \
       : [psrc_ld_m] "m"(*psrc_ld_m)); \
   val_m; \
 })
 #else  // !(__mips == 64)

@@ -55,9 +55,9 @@
 ({ \
   uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint32_t val_m = (val); \
-  asm volatile("sw %[val_m], %[pdst_sw_m] \n" \
+  asm("sw %[val_m], %[pdst_sw_m] \n" \
       : [pdst_sw_m] "=m"(*pdst_sw_m) \
       : [val_m] "r"(val_m)); \
 })
 
 #if (__mips == 64)

@@ -65,9 +65,9 @@
 ({ \
   uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint64_t val_m = (val); \
-  asm volatile("sd %[val_m], %[pdst_sd_m] \n" \
+  asm("sd %[val_m], %[pdst_sd_m] \n" \
       : [pdst_sd_m] "=m"(*pdst_sd_m) \
       : [val_m] "r"(val_m)); \
 })
 #else  // !(__mips == 64)
 #define SD(val, pdst) \

@@ -86,8 +86,7 @@
   uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
   uint32_t val_lw_m; \
   \
-  __asm__ volatile( \
-      "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+  asm("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
       "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
   \
       : [val_lw_m] "=&r"(val_lw_m) \

@@ -102,8 +101,7 @@
   uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
   uint64_t val_ld_m = 0; \
   \
-  __asm__ volatile( \
-      "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+  asm("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
       "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
   \
       : [val_ld_m] "=&r"(val_ld_m) \

@@ -130,9 +128,9 @@
 ({ \
   uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
   uint32_t val_m = (val); \
-  asm volatile("usw %[val_m], %[pdst_sw_m] \n" \
+  asm("usw %[val_m], %[pdst_sw_m] \n" \
       : [pdst_sw_m] "=m"(*pdst_sw_m) \
       : [val_m] "r"(val_m)); \
 })
 
 #define SD(val, pdst) \
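
Note on the macro hunks above: the LW/LD/SW/SD helpers name the exact bytes they
access through "m" operands, so normal data-flow already keeps the asm alive and
correctly ordered, and the volatile qualifier was redundant there. A sketch of the
same pattern, modeled on the SW macro (identifiers illustrative):

  #include <stdint.h>

  /* The "=m" output tells GCC this asm writes *pdst, so the statement takes
   * part in ordinary dependency analysis; no volatile is needed. */
  static inline void store_u32(uint32_t val, void* pdst) {
    asm("sw %[val_m], %[pdst_sw_m] \n"
        : [pdst_sw_m] "=m"(*(uint32_t*)pdst)
        : [val_m] "r"(val));
  }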
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1888
+#define LIBYUV_VERSION 1889
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -29,7 +29,8 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                                int count) {
   uint64_t diff;
 
-  asm("xor %3,%3 \n"
+  asm volatile (
+      "xor %3,%3 \n"
       "xor %%r8,%%r8 \n"
       "xor %%r9,%%r9 \n"
       "xor %%r10,%%r10 \n"

@@ -76,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                                int count) {
   uint32_t diff = 0u;
 
-  asm(
+  asm volatile (
       // Process 16 bytes per loop.
       LABELALIGN
       "1: \n"

@@ -120,7 +121,8 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
                                int count) {
   uint32_t diff;
 
-  asm("movdqa %4,%%xmm2 \n"
+  asm volatile (
+      "movdqa %4,%%xmm2 \n"
       "movdqa %5,%%xmm3 \n"
       "pxor %%xmm0,%%xmm0 \n"
       "pxor %%xmm1,%%xmm1 \n"

@@ -178,7 +180,8 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
                               int count) {
   uint32_t diff;
 
-  asm("vbroadcastf128 %4,%%ymm2 \n"
+  asm volatile (
+      "vbroadcastf128 %4,%%ymm2 \n"
       "vbroadcastf128 %5,%%ymm3 \n"
       "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
       "vpxor %%ymm1,%%ymm1,%%ymm1 \n"

@@ -231,7 +234,8 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm("pxor %%xmm0,%%xmm0 \n"
+  asm volatile (
+      "pxor %%xmm0,%%xmm0 \n"
       "pxor %%xmm5,%%xmm5 \n"
 
       LABELALIGN

@@ -296,7 +300,8 @@ static const uvec32 kHashMul3 = {
 
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   uint32_t hash;
-  asm("movd %2,%%xmm0 \n"
+  asm volatile (
+      "movd %2,%%xmm0 \n"
       "pxor %%xmm7,%%xmm7 \n"
       "movdqa %4,%%xmm6 \n"
 
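
Conversely, the x86 kernels above read src_a/src_b through pointers passed in
registers, with no "m" operand covering the buffers, so the memory dependency is
invisible to the compiler; adding volatile keeps these blocks from being deleted or
moved when their results look dead. A reduced sketch of that situation (hypothetical
function, not from this file):

  #include <stdint.h>

  /* "r"(p) passes only the pointer; the load itself is invisible to the
   * optimizer, so volatile pins the asm in place. */
  static inline uint32_t first_byte(const uint8_t* p) {
    uint32_t v;
    asm volatile("movzbl (%1), %0" : "=r"(v) : "r"(p));
    return v;
  }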
@@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
                               int count) {
   uint32_t diff;
 
-  asm volatile(
+  asm volatile (
       "vmov.u16 q4, #0 \n"  // accumulator
 
       "1: \n"

@@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "vmov.u8 q8, #0 \n"
       "vmov.u8 q10, #0 \n"
       "vmov.u8 q9, #0 \n"

@@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
   uint32_t diff;
-  asm volatile(
+  asm volatile (
       "movi v4.8h, #0 \n"
 
       "1: \n"

@@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "movi v16.16b, #0 \n"
       "movi v17.16b, #0 \n"
       "movi v18.16b, #0 \n"

@@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
                                       const uint8_t* src_b,
                                       int count) {
   uint32_t diff;
-  asm volatile(
+  asm volatile (
       "movi v4.4s, #0 \n"
       "movi v5.4s, #0 \n"
       "movi v6.16b, #1 \n"

@@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
                                      int count) {
   // count is guaranteed to be a multiple of 32.
   uint32_t sse;
-  asm volatile(
+  asm volatile (
       "movi v4.4s, #0 \n"
       "movi v5.4s, #0 \n"
 
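
The NEON compare hunks above are whitespace-only: "asm volatile(" becomes
"asm volatile (", presumably so that the existing blocks match the formatting of
the kernels that gained volatile elsewhere in this change.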
@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
                         uint8_t* dst,
                         int dst_stride,
                         int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                              uint8_t* dst,
                              int dst_stride,
                              int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
                          uint8_t* dst_b,
                          int dst_stride_b,
                          int width) {
-  asm(
+  asm volatile (
       // Read in the data from the source pointer.
       // First round of bit swap.
       LABELALIGN

@@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
                           uint8_t* dst,
                           int dst_stride,
                           int width) {
-  asm(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "movdqu (%0),%%xmm0 \n"  // a b c d

@@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
                           uint8_t* dst,
                           int dst_stride,
                           int width) {
-  asm(
+  asm volatile (
       // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
       "1: \n"
       "vmovdqu (%0),%%xmm0 \n"  // a b c d
@@ -51,6 +51,16 @@ extern "C" {
     out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \
   }
 
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
 void TransposeUVWx16_C(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst_a,
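
The TransposeWx16_C added above is a plain C fallback that builds a Wx16 transpose
from two Wx8 transposes: the first TransposeWx8_C call handles rows 0-7, and the
second handles rows 8-15 by offsetting the source by 8 * src_stride and the
destination by 8 columns.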
@@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src,
                        int dst_stride,
                        int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          int dst_stride_b,
                          int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
   uint8_t* dst1 = dst + dst_stride;
   uint8_t* dst2 = dst1 + dst_stride;
   uint8_t* dst3 = dst2 + dst_stride;
-  asm volatile(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"

@@ -27,7 +27,8 @@ void TransposeWx16_NEON(const uint8_t* src,
                         int dst_stride,
                         int width) {
   const uint8_t* src_temp;
-  asm("1: \n"
+  asm volatile (
+      "1: \n"
       "mov %[src_temp], %[src] \n"
 
       "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"

@@ -144,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
                          int dst_stride_b,
                          int width) {
   const uint8_t* temp;
-  asm(
+  asm volatile (
       // loops are on blocks of 8. loop will stop when
       // counter gets to or below 0. starting the counter
       // at w-8 allow for this

@@ -238,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
   uint8_t* dst1 = dst + dst_stride;
   uint8_t* dst2 = dst1 + dst_stride;
   uint8_t* dst3 = dst2 + dst_stride;
-  asm volatile(
+  asm volatile (
       // Main loop transpose 4x4. Read a column, write a row.
       "1: \n"
       "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
(File diff suppressed because it is too large.)
@@ -2037,7 +2037,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
                                   int width,
                                   const struct RgbConstants* rgbconstants) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2099,7 +2099,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
                                   int width,
                                   const struct RgbConstants* rgbconstants) {
   int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2163,7 +2163,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
       25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
       25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-  asm volatile(
+  asm volatile (
       "xvldrepl.b $xr0, %3, 0 \n\t"  // load rgbconstants
       "xvldrepl.b $xr1, %3, 1 \n\t"  // load rgbconstants
       "xvldrepl.b $xr2, %3, 2 \n\t"  // load rgbconstants

@@ -2805,8 +2805,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants

@@ -2864,8 +2863,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants

@@ -2922,8 +2920,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
       0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
       31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-  asm volatile(
-      "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
+  asm("vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
       "vldrepl.b $vr2, %3, 2 \n\t"  // load rgbconstants
       "vldrepl.h $vr3, %3, 4 \n\t"  // load rgbconstants
@@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV444 YUVTORGB

@@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
      RGBTORGB8

@@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV444 YUVTORGB
      RGBTORGB8

@@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                              uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8

@@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
                         uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV422 YUVTORGB

@@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "1: \n" READYUV422 YUVTORGB
      RGBTORGB8

@@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                             uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "vmov.u8 d7, #0x0f \n"  // vbic bits to clear

@@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUV400 YUVTORGB

@@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 }
 
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d23, #255 \n"
      "1: \n"
      "vld1.8 {d20}, [%0]! \n"

@@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8

@@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV21 YUVTORGB RGBTORGB8

@@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                           uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READNV12 YUVTORGB RGBTORGB8

@@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READYUY2 YUVTORGB RGBTORGB8

@@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile(
+  asm volatile (
      YUVTORGB_SETUP
      "vmov.u8 d6, #255 \n"
      "1: \n" READUYVY YUVTORGB RGBTORGB8
@@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pairs of UV
      "subs %3, %3, #16 \n"  // 16 processed per loop

@@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
                     ptrdiff_t src_tile_stride,
                     uint8_t* dst,
                     int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %3 \n"  // load 16 bytes
      "subs %2, %2, #16 \n"  // 16 processed per loop

@@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
                        ptrdiff_t src_tile_stride,
                        uint16_t* dst,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.16 {q0, q1}, [%0], %3 \n"  // load 16 pixels
      "subs %2, %2, #16 \n"  // 16 processed per loop

@@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d0, d1}, [%0], %4 \n"
      "subs %3, %3, #16 \n"

@@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
                        ptrdiff_t src_uv_tile_stride,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "pld [%0, #1792] \n"

@@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
                        ptrdiff_t src_uv_tile_stride,
                        uint8_t* dst_yuy2,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0], %4 \n"  // Load 16 Y
      "vld1.8 {q1}, [%1], %5 \n"  // Load 8 UV

@@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 #endif
 
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q14}, [%0]! \n"  // Load lower bits.
      "vld1.8 {q9}, [%0]! \n"  // Load upper bits row

@@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load U
      "vld1.8 {q1}, [%1]! \n"  // load V

@@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // next 8 RGB

@@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_b,
                       uint8_t* dst_rgb,
                       int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G

@@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        uint8_t* dst_a,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB

@@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
                        const uint8_t* src_a,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R
      "vld1.8 {q1}, [%1]! \n"  // load G

@@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // next 8 ARGB

@@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 q3, #255 \n"  // load A(255)
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"  // load R

@@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
                        int depth,
                        int width) {
   int shift = 10 - depth;
-  asm volatile(
+  asm volatile (
      "vmov.u32 q14, #1023 \n"
      "vdup.32 q15, %5 \n"
      "1: \n"

@@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
                           uint8_t* dst_ar30,
                           int /* depth */,
                           int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u32 q14, #1023 \n"
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"  // B

@@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
                        int width) {
   int shift = 16 - depth;
   int mask = (1 << depth) - 1;
-  asm volatile(
+  asm volatile (
 
      "vdup.u16 q15, %6 \n"
      "vdup.u16 q14, %7 \n"

@@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
                        int width) {
   int shift = 16 - depth;
   int mask = (1 << depth) - 1;
-  asm volatile(
+  asm volatile (
 
      "vmov.u8 q3, #0xff \n"  // A (0xffff)
      "vdup.u16 q15, %5 \n"

@@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                             int depth,
                             int width) {
   int shift = 8 - depth;
-  asm volatile(
+  asm volatile (
 
      "vdup.16 q15, %6 \n"
      "1: \n"

@@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                             int depth,
                             int width) {
   int shift = 8 - depth;
-  asm volatile(
+  asm volatile (
 
      "vdup.16 q15, %5 \n"
      "vmov.u8 d6, #0xff \n"  // A (0xff)
@@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 
 // Copy multiple of 32.  vld4.8 allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"  // 32 processed per loop

@@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 
 // SetRow writes 'width' bytes using an 8 bit value repeated.
 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
-  asm volatile(
+  asm volatile (
      "vdup.8 q0, %2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"  // 16 bytes per loop

@@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
 
 // ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
-  asm volatile(
+  asm volatile (
      "vdup.u32 q0, %2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"  // 4 pixels per loop

@@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
 }
 
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "add %0, %0, %2 \n"
      "sub %0, %0, #32 \n"  // 32 bytes per loop

@@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %2, lsl #1 \n"

@@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
-  asm volatile(
+  asm volatile (
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"

@@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 }
 
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "add %0, %0, %2, lsl #2 \n"
      "sub %0, #32 \n"
 

@@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                          uint8_t* dst_rgb24,
                          int width) {
   src_rgb24 += width * 3 - 24;
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0], %3 \n"  // src -= 24
      "subs %2, #8 \n"  // 8 pixels per loop.

@@ -1315,7 +1315,7 @@
 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RGB24.

@@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 }
 
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.

@@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 }
 
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d0, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.

@@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
   );
 }
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.

@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                             uint8_t* dst_argb,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.

@@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                             uint8_t* dst_argb,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.

@@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb24,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 16 pixels of ARGB.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"

@@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 }
 
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
 }
 
 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %2, %2, #16 \n"  // 16 processed per loop.

@@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
 }
 
 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %2, %2, #16 \n"  // 16 processed per loop.

@@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.

@@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
@@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.

@@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_uyvy
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.

@@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                         int stride_yuy2,
                         uint8_t* dst_uv,
                         int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.

@@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vld1.8 {q2}, [%3] \n"  // shuffler
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 4 pixels.

@@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d0, d2}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d1}, [%1]! \n"  // load 8 Us

@@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld2.8 {d1, d3}, [%0]! \n"  // load 16 Ys
      "vld1.8 {d0}, [%1]! \n"  // load 8 Us

@@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                           uint8_t* dst_rgb565,
                           int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                 uint8_t* dst_rgb,
                                 uint32_t dither4,
                                 int width) {
-  asm volatile(
+  asm volatile (
      "vdup.32 d7, %2 \n"  // dither4
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%1]! \n"  // load 8 pixels of ARGB.

@@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb1555,
                             int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"  // 8 processed per loop.

@@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb4444,
                             int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d7, #0x0f \n"  // bits to clear with
                              // vbic.
      "1: \n"

@@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_a,
                               int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels

@@ -1838,7 +1838,7 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_v,
                                int width,
                                const struct RgbUVConstants* rgbuvconstants) {
-  asm volatile(
+  asm volatile (
 
      "vld1.8 {d0}, [%4] \n"  // load rgbuvconstants
      "vdup.u8 d24, d0[0] \n"  // UB 0.875 coefficient

@@ -2366,7 +2366,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2432,7 +2432,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2498,7 +2498,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
-  asm volatile(
+  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient

@@ -2550,7 +2550,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2576,7 +2576,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                          uint8_t* dst_y,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2602,7 +2602,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                          uint8_t* dst_y,
                          int width) {
-  asm volatile(
+  asm volatile (
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient

@@ -2628,7 +2628,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
-  asm volatile(
+  asm volatile (
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"
      "vld1.8 {q2}, [%0]! \n"

@@ -2651,7 +2651,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
-  asm volatile(
+  asm volatile (
      "vld1.8 {q4}, [%3] \n"  // shuffler
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -2677,7 +2677,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
|
|||||||
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
"vld1.16 {q1}, [%0]! \n"
|
"vld1.16 {q1}, [%0]! \n"
|
||||||
@ -2703,7 +2703,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
|
|||||||
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d8}, [%3] \n" // shuffler
|
"vld1.8 {d8}, [%3] \n" // shuffler
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -2756,7 +2756,7 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2806,7 +2806,7 @@ void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2850,7 +2850,7 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
|||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width,
|
int width,
|
||||||
const struct RgbConstants* rgbconstants) {
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vdup.u8 d20, d0[0] \n"
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vdup.u8 d21, d0[1] \n"
|
"vdup.u8 d21, d0[1] \n"
|
||||||
@ -2902,7 +2902,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
|||||||
int dst_width,
|
int dst_width,
|
||||||
int source_y_fraction) {
|
int source_y_fraction) {
|
||||||
int y1_fraction = source_y_fraction;
|
int y1_fraction = source_y_fraction;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"cmp %4, #0 \n"
|
"cmp %4, #0 \n"
|
||||||
"beq 100f \n"
|
"beq 100f \n"
|
||||||
"add %2, %1 \n"
|
"add %2, %1 \n"
|
||||||
@ -2964,7 +2964,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
|||||||
int y0_fraction = 256 - y1_fraction;
|
int y0_fraction = 256 - y1_fraction;
|
||||||
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
||||||
|
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"cmp %4, #0 \n"
|
"cmp %4, #0 \n"
|
||||||
"beq 100f \n"
|
"beq 100f \n"
|
||||||
"cmp %4, #128 \n"
|
"cmp %4, #128 \n"
|
||||||
@ -3019,7 +3019,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"subs %3, #8 \n"
|
"subs %3, #8 \n"
|
||||||
"blt 89f \n"
|
"blt 89f \n"
|
||||||
// Blend 8 pixels.
|
// Blend 8 pixels.
|
||||||
@ -3078,7 +3078,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
|||||||
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
"vmov.u16 q15, #0x00ff \n" // 255 for rounding up
|
||||||
|
|
||||||
// Attenuate 8 pixels.
|
// Attenuate 8 pixels.
|
||||||
@ -3107,7 +3107,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
|
|||||||
int interval_size,
|
int interval_size,
|
||||||
int interval_offset,
|
int interval_offset,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.u16 q8, %2 \n"
|
"vdup.u16 q8, %2 \n"
|
||||||
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
"vshr.u16 q8, q8, #1 \n" // scale >>= 1
|
||||||
"vdup.u16 q9, %3 \n" // interval multiply.
|
"vdup.u16 q9, %3 \n" // interval multiply.
|
||||||
@ -3149,7 +3149,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width,
|
int width,
|
||||||
uint32_t value) {
|
uint32_t value) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
"vdup.u32 q0, %3 \n" // duplicate scale value.
|
||||||
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
"vzip.u8 d0, d1 \n" // d0 aarrggbb.
|
||||||
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
"vshr.u16 q0, q0, #1 \n" // scale / 2.
|
||||||
@ -3183,7 +3183,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
|
|||||||
// Similar to ARGBToYJ but stores ARGB.
|
// Similar to ARGBToYJ but stores ARGB.
|
||||||
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
|
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
|
||||||
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
||||||
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
||||||
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
||||||
@ -3210,7 +3210,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
|
|||||||
// g = (r * 45 + g * 88 + b * 22) >> 7
|
// g = (r * 45 + g * 88 + b * 22) >> 7
|
||||||
// r = (r * 50 + g * 98 + b * 24) >> 7
|
// r = (r * 50 + g * 98 + b * 24) >> 7
|
||||||
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d20, #17 \n" // BB coefficient
|
"vmov.u8 d20, #17 \n" // BB coefficient
|
||||||
"vmov.u8 d21, #68 \n" // BG coefficient
|
"vmov.u8 d21, #68 \n" // BG coefficient
|
||||||
"vmov.u8 d22, #35 \n" // BR coefficient
|
"vmov.u8 d22, #35 \n" // BR coefficient
|
||||||
@ -3251,7 +3251,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
|
|||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
const int8_t* matrix_argb,
|
const int8_t* matrix_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
|
||||||
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
|
||||||
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
|
||||||
@ -3310,7 +3310,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3339,7 +3339,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3362,7 +3362,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
|
|||||||
const uint8_t* src_argb1,
|
const uint8_t* src_argb1,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||||
@ -3389,7 +3389,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d3, #255 \n" // alpha
|
"vmov.u8 d3, #255 \n" // alpha
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -3414,7 +3414,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
|
||||||
@ -3440,7 +3440,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
|
|||||||
const uint8_t* src_sobely,
|
const uint8_t* src_sobely,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u8 d3, #255 \n" // alpha
|
"vmov.u8 d3, #255 \n" // alpha
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -3467,7 +3467,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
|
|||||||
const uint8_t* src_y2,
|
const uint8_t* src_y2,
|
||||||
uint8_t* dst_sobelx,
|
uint8_t* dst_sobelx,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d0}, [%0],%5 \n" // top
|
"vld1.8 {d0}, [%0],%5 \n" // top
|
||||||
"vld1.8 {d1}, [%0],%6 \n"
|
"vld1.8 {d1}, [%0],%6 \n"
|
||||||
@ -3505,7 +3505,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
|
|||||||
const uint8_t* src_y1,
|
const uint8_t* src_y1,
|
||||||
uint8_t* dst_sobely,
|
uint8_t* dst_sobely,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d0}, [%0],%4 \n" // left
|
"vld1.8 {d0}, [%0],%4 \n" // left
|
||||||
"vld1.8 {d1}, [%1],%4 \n"
|
"vld1.8 {d1}, [%1],%4 \n"
|
||||||
@ -3542,7 +3542,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
|
|||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float /*unused*/,
|
float /*unused*/,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -3568,7 +3568,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
|||||||
uint16_t* dst,
|
uint16_t* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
"vld1.8 {q1}, [%0]! \n" // load 8 shorts
|
||||||
@ -3594,7 +3594,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
|||||||
float* dst,
|
float* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
||||||
@ -3623,7 +3623,7 @@ void GaussCol_NEON(const uint16_t* src0,
|
|||||||
const uint16_t* src4,
|
const uint16_t* src4,
|
||||||
uint32_t* dst,
|
uint32_t* dst,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u16 d6, #4 \n" // constant 4
|
"vmov.u16 d6, #4 \n" // constant 4
|
||||||
"vmov.u16 d7, #6 \n" // constant 6
|
"vmov.u16 d7, #6 \n" // constant 6
|
||||||
|
|
||||||
@ -3660,7 +3660,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
|
|||||||
const uint32_t* src1 = src + 1;
|
const uint32_t* src1 = src + 1;
|
||||||
const uint32_t* src2 = src + 2;
|
const uint32_t* src2 = src + 2;
|
||||||
const uint32_t* src3 = src + 3;
|
const uint32_t* src3 = src + 3;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vmov.u32 q10, #4 \n" // constant 4
|
"vmov.u32 q10, #4 \n" // constant 4
|
||||||
"vmov.u32 q11, #6 \n" // constant 6
|
"vmov.u32 q11, #6 \n" // constant 6
|
||||||
|
|
||||||
@ -3698,7 +3698,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
|||||||
const uint8_t* src_vu,
|
const uint8_t* src_vu,
|
||||||
uint8_t* dst_yuv24,
|
uint8_t* dst_yuv24,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
||||||
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
||||||
@ -3722,7 +3722,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
|||||||
int src_stride_ayuv,
|
int src_stride_ayuv,
|
||||||
uint8_t* dst_uv,
|
uint8_t* dst_uv,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||||
@ -3753,7 +3753,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
|||||||
int src_stride_ayuv,
|
int src_stride_ayuv,
|
||||||
uint8_t* dst_vu,
|
uint8_t* dst_vu,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
"add %1, %0, %1 \n" // src_stride + src_AYUV
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
|
||||||
@ -3783,7 +3783,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
|||||||
// Copy row of AYUV Y's into Y.
|
// Copy row of AYUV Y's into Y.
|
||||||
// Similar to ARGBExtractAlphaRow_NEON
|
// Similar to ARGBExtractAlphaRow_NEON
|
||||||
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
||||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
||||||
@ -3799,7 +3799,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
|||||||
|
|
||||||
// Convert UV plane of NV12 to VU of NV21.
|
// Convert UV plane of NV12 to VU of NV21.
|
||||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||||
"vld2.8 {d1, d3}, [%0]! \n"
|
"vld2.8 {d1, d3}, [%0]! \n"
|
||||||
@ -3822,7 +3822,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
|||||||
int width) {
|
int width) {
|
||||||
const uint8_t* src_u_1 = src_u + src_stride_u;
|
const uint8_t* src_u_1 = src_u + src_stride_u;
|
||||||
const uint8_t* src_v_1 = src_v + src_stride_v;
|
const uint8_t* src_v_1 = src_v + src_stride_v;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
||||||
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
||||||
@ -3853,7 +3853,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
|
|||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = depth - 16; // Negative for right shift.
|
int shift = depth - 16; // Negative for right shift.
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %4 \n"
|
"vdup.16 q2, %4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
|
||||||
@ -3877,7 +3877,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
|
|||||||
int depth,
|
int depth,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 16 - depth;
|
int shift = 16 - depth;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %4 \n"
|
"vdup.16 q2, %4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
"vld1.16 {q0}, [%0]! \n" // load 8 U
|
||||||
@ -3899,7 +3899,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
|
|||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %3 \n"
|
"vdup.16 q2, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
@ -3921,7 +3921,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
|||||||
uint16_t* dst_y,
|
uint16_t* dst_y,
|
||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 d8, %3 \n"
|
"vdup.16 d8, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q2, q3}, [%0]! \n"
|
"vld1.16 {q2, q3}, [%0]! \n"
|
||||||
@ -3953,7 +3953,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
|
|||||||
int scale,
|
int scale,
|
||||||
int width) {
|
int width) {
|
||||||
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
|
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"vdup.16 q2, %3 \n"
|
"vdup.16 q2, %3 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld1.16 {q0}, [%0]! \n"
|
"vld1.16 {q0}, [%0]! \n"
|
||||||
|
|||||||
(File diff suppressed because it is too large.)
@@ -47,7 +47,7 @@ extern "C" {
 // register) is set to round-to-nearest-up mode(0).
 #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
 { \
-  asm volatile("csrwi vxrm, 0"); \
+  asm volatile ("csrwi vxrm, 0"); \
 ub = yuvconst->kUVCoeff[0]; \
 vr = yuvconst->kUVCoeff[1]; \
 ug = yuvconst->kUVCoeff[2]; \
@@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
 vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
 // To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) sets to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
 if (is_yb_positive) {
 v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
 } else {
@@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
 }
 // To match behavior on other platforms, vxrm (fixed-point rounding mode
 // register) is set to round-to-nearest-up(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
 // Blend 50 / 50.
 if (y1_fraction == 128) {
 do {
@@ -139,7 +139,8 @@ void I444ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" /* A */
 "subs %w[width], %w[width], %w[vl] \n"
@@ -181,7 +182,8 @@ void I400ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n"
 "dup z19.b, #255 \n" // A
 YUVTORGB_SVE_SETUP
@@ -229,7 +231,8 @@ void I422ToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" /* A */
 "subs %w[width], %w[width], %w[vl] \n"
@@ -273,7 +276,8 @@ void I422ToRGBARow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "dup z19.b, #255 \n" // A
 "subs %w[width], %w[width], %w[vl] \n"
@@ -318,7 +322,8 @@ void I444AlphaToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -366,7 +371,8 @@ void I422AlphaToARGBRow_SVE2(const uint8_t* src_y,
 const struct YuvConstants* yuvconstants,
 int width) {
 uint64_t vl;
-  asm("cnth %[vl] \n"
+  asm volatile (
+ "cnth %[vl] \n"
 "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -416,11 +422,13 @@ static inline void NVToARGBRow_SVE2(const uint8_t* src_y,
 uint32_t nv_v_start,
 uint32_t nv_v_step) {
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 width_last_y = width_last_y == 0 ? vl : width_last_y;
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n" YUVTORGB_SVE_SETUP
+  asm volatile (
+ "ptrue p0.b \n" YUVTORGB_SVE_SETUP
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -534,7 +542,7 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
 const int16_t* uvconstants) {
 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
 uint64_t vl;
-  asm volatile(
+  asm volatile (
 "ptrue p0.b \n"
 "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
 "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"
@@ -746,7 +754,8 @@ void ARGBToRGB565Row_SVE2(const uint8_t* src_argb,
 unsigned bsl_mask = 0x7e0;
 uint64_t vl;
 width *= 2;
-  asm("mov z3.h, #3 \n"
+  asm volatile (
+ "mov z3.h, #3 \n"
 "dup z4.h, %w[bsl_mask] \n"

 "cntb %[vl] \n"
@@ -787,7 +796,8 @@ void ARGBToRGB565DitherRow_SVE2(const uint8_t* src_argb,
 unsigned bsl_mask = 0x7e0;
 uint64_t vl;
 width *= 2;
-  asm("mov z3.h, #3 \n"
+  asm volatile (
+ "mov z3.h, #3 \n"
 "dup z4.h, %w[bsl_mask] \n"
 "dup z2.s, %w[dither4] \n"
 "zip1 z2.b, z2.b, z2.b \n"
@@ -844,7 +854,8 @@ void ARGB1555ToARGBRow_SVE2(const uint8_t* src_argb1555,
 uint8_t* dst_argb,
 int width) {
 uint64_t vl;
-  asm("mov z4.h, #0x0300 \n"
+  asm volatile (
+ "mov z4.h, #0x0300 \n"
 "ptrue p0.b \n"

 "cnth %x[vl] \n"
@@ -912,7 +923,8 @@ void AYUVToUVRow_SVE2(const uint8_t* src_ayuv,
 // Output a row of UV values, filtering 2x2 rows of AYUV.
 const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
 int vl;
-  asm("cntb %x[vl] \n"
+  asm volatile (
+ "cntb %x[vl] \n"
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"

@@ -950,7 +962,8 @@ void AYUVToVURow_SVE2(const uint8_t* src_ayuv,
 // Output a row of VU values, filtering 2x2 rows of AYUV.
 const uint8_t* src_ayuv1 = src_ayuv + src_stride_ayuv;
 int vl;
-  asm("cntb %x[vl] \n"
+  asm volatile (
+ "cntb %x[vl] \n"
 "cmp %w[width], %w[vl] \n"
 "subs %w[width], %w[width], %w[vl] \n"
 "b.lt 2f \n"
@@ -990,10 +1003,12 @@ void YUY2ToARGBRow_SVE2(const uint8_t* src_yuy2,
 uint32_t nv_v_start = 0x0003'0003U;
 uint32_t nv_v_step = 0x0004'0004U;
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n"
+  asm volatile (
+ "ptrue p0.b \n"
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -1047,10 +1062,12 @@ void UYVYToARGBRow_SVE2(const uint8_t* src_uyvy,
 uint32_t nv_v_start = 0x0002'0002U;
 uint32_t nv_v_step = 0x0004'0004U;
 uint64_t vl;
-  asm("cnth %0" : "=r"(vl));
+  asm volatile (
+ "cnth %0" : "=r"(vl));
 int width_last_y = width & (vl - 1);
 int width_last_uv = width_last_y + (width_last_y & 1);
-  asm("ptrue p0.b \n"
+  asm volatile (
+ "ptrue p0.b \n"
 "index z22.s, %w[nv_u_start], %w[nv_u_step] \n"
 "index z23.s, %w[nv_v_start], %w[nv_v_step] \n"
 "dup z19.b, #255 \n" // A
@@ -193,7 +193,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "vmovdqu (%0),%%ymm0 \n"
 "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
 "m"(kShuf1), // %1
 "m"(kShuf2) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm2 \n"
@@ -515,7 +515,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kMadd11), // %1
 "m"(kRound34) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm6 \n"
 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -578,7 +578,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kRound34) // %2
 );

-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm6 \n"
 "movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -667,7 +667,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kShufAb2), // %2
 "m"(kScaleAb2) // %3
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x00(%0,%3,1),%%xmm1 \n"
@@ -708,7 +708,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
 "m"(kShufAc3), // %1
 "m"(kScaleAc33) // %2
 );
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x00(%0,%3,1),%%xmm6 \n"
@@ -821,7 +821,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 ptrdiff_t dst_stride,
 int dst_width) {
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "pxor %%xmm0,%%xmm0 \n" // 0
 // above line
@@ -1900,7 +1900,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
 int dx) {
 (void)x;
 (void)dx;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%1),%%xmm0 \n"
 "lea 0x10(%1),%1 \n"
@@ -1925,7 +1925,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -1947,7 +1947,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -1971,7 +1971,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
 ptrdiff_t src_stride,
 uint8_t* dst_argb,
 int dst_width) {
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%0),%%xmm0 \n"
 "movdqu 0x10(%0),%%xmm1 \n"
@@ -2153,7 +2153,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
 int dx) {
 (void)x;
 (void)dx;
-  asm(LABELALIGN
+  asm volatile (LABELALIGN
 "1: \n"
 "movdqu (%1),%%xmm0 \n"
 "lea 0x10(%1),%1 \n"
@@ -28,7 +28,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 // load even pixels into q0, odd into q1
 "vld2.8 {q0, q1}, [%0]! \n"
@@ -49,7 +49,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -69,7 +69,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %0 \n"
 "1: \n"
@@ -100,7 +100,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #8 \n" // 8 processed per loop
@@ -120,7 +120,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
 const uint8_t* src_ptr1 = src_ptr + src_stride;
 const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
 const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load up 16x4
 "vld1.8 {q1}, [%3]! \n"
@@ -154,7 +154,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
 "subs %2, %2, #24 \n"
@@ -172,7 +172,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
@@ -229,7 +229,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vmov.u8 d24, #3 \n"
 "add %3, %0 \n"
 "1: \n"
@@ -281,7 +281,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "vld1.8 {q3}, [%3] \n"
 "1: \n"
 "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@@ -305,7 +305,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
 int dst_width) {
 const uint8_t* src_ptr1 = src_ptr + src_stride * 2;

-  asm volatile(
+  asm volatile (
 "vld1.16 {q13}, [%5] \n"
 "vld1.8 {q14}, [%6] \n"
 "vld1.8 {q15}, [%7] \n"
@@ -415,7 +415,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst_ptr,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "vld1.16 {q13}, [%4] \n"
 "vld1.8 {q14}, [%5] \n"
 "add %3, %0 \n"
@@ -508,7 +508,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 const uint8_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u8 d30, #3 \n"

 "1: \n"
@@ -545,7 +545,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
 const uint8_t* src_temp = src_ptr + 1;
 const uint8_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"
 "vmov.u8 d28, #3 \n"

@@ -607,7 +607,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"

 "1: \n"
@@ -643,7 +643,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 1;
 const uint16_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"

 "1: \n"
@@ -694,7 +694,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
 "vmov.u16 d31, #3 \n"

 "1: \n"
@@ -738,7 +738,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 1;
 const uint16_t* src_temp1 = src_ptr1 + 1;

-  asm volatile(
+  asm volatile (
 "vmov.u16 d31, #3 \n"
 "vmov.u32 q14, #3 \n"

@@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst_ptr,
 int dst_width) {
 const uint8_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
 "vmov.u8 d30, #3 \n"

 "1: \n"
@@ -827,7 +827,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
 const uint8_t* src_temp = src_ptr + 2;
 const uint8_t* src_temp1 = src_ptr1 + 2;

-  asm volatile(
+  asm volatile (
 "vmov.u16 q15, #3 \n"
 "vmov.u8 d28, #3 \n"

@@ -889,7 +889,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
 uint16_t* dst_ptr,
 int dst_width) {
 const uint16_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
 "vmov.u16 d30, #3 \n"

 "1: \n"
@@ -934,7 +934,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 const uint16_t* src_temp = src_ptr + 2;
 const uint16_t* src_temp1 = src_ptr1 + 2;

-  asm volatile(
+  asm volatile (
 "vmov.u16 d30, #3 \n"
 "vmov.u32 q14, #3 \n"

@@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
 uint16_t* dst_ptr,
 int src_width) {
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.16 {q1, q2}, [%1] \n" // load accumulator
 "vld1.8 {q0}, [%0]! \n" // load 16 bytes
@@ -1086,7 +1086,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
 ptrdiff_t src_stride,
 int dst_width,
 int source_y_fraction) {
-  asm volatile(
+  asm volatile (
 "cmp %4, #0 \n"
 "beq 100f \n"
 "add %2, %1 \n"
@@ -1170,7 +1170,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1198,7 +1198,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
 "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1219,7 +1219,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1258,7 +1258,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "mov r12, %3, lsl #2 \n"
 "1: \n"
 "vld1.32 {d0[0]}, [%0], r12 \n"
@@ -1282,7 +1282,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
 int src_stepx,
 uint8_t* dst_argb,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 "mov r12, %4, lsl #2 \n"
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1330,7 +1330,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
 int dx) {
 int tmp;
 const uint8_t* src_tmp = src_argb;
-  asm volatile(
+  asm volatile (
 "1: \n"
 // clang-format off
 LOAD1_DATA32_LANE(d0, 0)
@@ -1433,7 +1433,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
 "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1452,7 +1452,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
 uint8_t* dst,
 int dst_width) {
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
 "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1471,7 +1471,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
 ptrdiff_t src_stride,
 uint8_t* dst,
 int dst_width) {
-  asm volatile(
+  asm volatile (
 // change the stride to row 2 pointer
 "add %1, %1, %0 \n"
 "1: \n"
@@ -1506,7 +1506,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
 const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
 const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
 (void)src_stride;
-  asm volatile(
+  asm volatile (
 "1: \n"
 "vld1.16 {d0[0]}, [%0], %6 \n"
 "vld1.16 {d0[1]}, [%1], %6 \n"
@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
// load even pixels into v0, odd into v1
|
// load even pixels into v0, odd into v1
|
||||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||||
@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
// load even pixels into v0, odd into v1
|
// load even pixels into v0, odd into v1
|
||||||
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
|
||||||
@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
|
|||||||
ptrdiff_t src_stride,
|
ptrdiff_t src_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
asm volatile(
|
asm volatile (
|
||||||
// change the stride to row 2 pointer
|
// change the stride to row 2 pointer
|
||||||
"add %1, %1, %0 \n"
|
"add %1, %1, %0 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||||
@ -122,7 +122,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
|
|||||||
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
const uint8_t* src_ptr1 = src_ptr + src_stride;
|
||||||
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
|
||||||
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
|
"ld1 {v0.16b}, [%0], #16 \n" // load up 16x4
|
||||||
"ld1 {v1.16b}, [%2], #16 \n"
|
"ld1 {v1.16b}, [%2], #16 \n"
|
||||||
@ -159,7 +159,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
|
|||||||
uint8_t* dst_ptr,
|
uint8_t* dst_ptr,
|
||||||
int dst_width) {
|
int dst_width) {
|
||||||
(void)src_stride;
|
(void)src_stride;
|
||||||
asm volatile(
|
asm volatile (
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
|
||||||
"subs %w2, %w2, #24 \n"
|
"subs %w2, %w2, #24 \n"
|
@@ -178,7 +178,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
-  asm volatile(
+  asm volatile (
       "movi v20.8b, #3 \n"
       "add %3, %3, %0 \n"
       "1: \n"
@@ -237,7 +237,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                int dst_width) {
-  asm volatile(
+  asm volatile (
       "movi v20.8b, #3 \n"
       "add %3, %3, %0 \n"
       "1: \n"
@@ -292,7 +292,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          uint8_t* dst_ptr,
                          int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "ld1 {v3.16b}, [%3] \n"
       "1: \n"
       "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
@@ -317,7 +317,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
   const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
-  asm volatile(
+  asm volatile (
       "ld1 {v29.8h}, [%5] \n"
       "ld1 {v30.16b}, [%6] \n"
       "ld1 {v31.8h}, [%7] \n"
@@ -439,7 +439,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile(
+  asm volatile (
       "ld1 {v30.8h}, [%4] \n"
       "ld1 {v31.16b}, [%5] \n"
       "add %2, %2, %0 \n"
@@ -539,7 +539,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
   const uint8_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
 
       "1: \n"
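The movi of #3 above is the 3:1 blend weight of the 2x linear upsampler. A scalar sketch, assuming the usual (3*near + far + 2) >> 2 rounding for this kernel family; the edge pixels that the real kernels treat specially are omitted:

#include <stdint.h>

// 2x horizontal upsample: each source pair (a, b) yields two outputs
// weighted 3:1 and 1:3, with +2 for round-to-nearest.
static void ScaleRowUp2_Linear_sketch(const uint8_t* src, uint8_t* dst,
                                      int dst_width) {
  for (int x = 0; x < dst_width / 2; ++x) {
    uint8_t a = src[x];
    uint8_t b = src[x + 1];
    dst[2 * x + 0] = (uint8_t)((3 * a + b + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((a + 3 * b + 2) >> 2);
  }
}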
@@ -578,7 +578,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
   const uint8_t* src_temp = src_ptr + 1;
   const uint8_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
       "movi v30.8h, #3 \n"
 
@@ -634,7 +634,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
   const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -671,7 +671,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 1;
   const uint16_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -725,7 +725,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width) {
   const uint16_t* src_temp = src_ptr + 1;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -770,7 +770,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 1;
   const uint16_t* src_temp1 = src_ptr1 + 1;
 
-  asm volatile(
+  asm volatile (
       "movi v31.4h, #3 \n"
       "movi v30.4s, #3 \n"
 
@@ -825,7 +825,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
                                uint8_t* dst_ptr,
                                int dst_width) {
   const uint8_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
 
       "1: \n"
@@ -864,7 +864,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
   const uint8_t* src_temp = src_ptr + 2;
   const uint8_t* src_temp1 = src_ptr1 + 2;
 
-  asm volatile(
+  asm volatile (
       "movi v31.8b, #3 \n"
       "movi v30.8h, #3 \n"
 
@@ -920,7 +920,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width) {
   const uint16_t* src_temp = src_ptr + 2;
-  asm volatile(
+  asm volatile (
       "movi v31.8h, #3 \n"
 
       "1: \n"
@@ -967,7 +967,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
   const uint16_t* src_temp = src_ptr + 2;
   const uint16_t* src_temp1 = src_ptr1 + 2;
 
-  asm volatile(
+  asm volatile (
       "movi v31.4h, #3 \n"
       "movi v30.4s, #3 \n"
 
@@ -1022,7 +1022,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
 void ScaleAddRow_NEON(const uint8_t* src_ptr,
                       uint16_t* dst_ptr,
                       int src_width) {
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
       "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
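ScaleAddRow is the accumulation primitive behind the box-filtered downscalers: as the "load accumulator" comment above indicates, it widens each source byte and adds it into a 16-bit accumulator row. A minimal scalar sketch of that contract:

#include <stdint.h>

// Add one source row into a 16-bit accumulator row, widening each byte.
static void ScaleAddRow_sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                               int src_width) {
  for (int x = 0; x < src_width; ++x) {
    dst_ptr[x] = (uint16_t)(dst_ptr[x] + src_ptr[x]);
  }
}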
@@ -1123,7 +1123,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                             uint8_t* dst,
                             int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
       "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
@@ -1145,7 +1145,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                   uint8_t* dst_argb,
                                   int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
       "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
@@ -1169,7 +1169,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                                uint8_t* dst,
                                int dst_width) {
   const uint8_t* src_ptr1 = src_ptr + src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
       "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
@@ -1200,7 +1200,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
   const uint8_t* src_argb3 = src_argb + src_stepx * 12;
   int64_t i = 0;
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ldr w10, [%[src], %[i]] \n"
       "ldr w11, [%[src1], %[i]] \n"
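ScaleARGBRowDownEven treats each ARGB pixel as one 32-bit word and keeps every src_stepx-th one, which is what the ldr w10/w11 word loads above implement. A scalar sketch, illustration only:

#include <stdint.h>
#include <string.h>

// Copy every src_stepx-th 32-bit ARGB pixel to the destination row.
static void ScaleARGBRowDownEven_sketch(const uint8_t* src_argb, int src_stepx,
                                        uint8_t* dst_argb, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    memcpy(dst_argb + 4 * x, src_argb + 4 * (x * src_stepx), 4);
  }
}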
@@ -1232,7 +1232,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                   int src_stepx,
                                   uint8_t* dst_argb,
                                   int dst_width) {
-  asm volatile(
+  asm volatile (
       "add %1, %1, %0 \n"
       "1: \n"
       "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1
@@ -1287,7 +1287,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
   int64_t x64 = (int64_t)x; // NOLINT
   int64_t dx64 = (int64_t)dx; // NOLINT
   int64_t tmp64;
-  asm volatile(
+  asm volatile (
       "1: \n"
       // clang-format off
       LOAD1_DATA32_LANE(v0, 0)
@@ -1394,7 +1394,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint16_t* dst,
                               int dst_width) {
-  asm volatile(
+  asm volatile (
       // change the stride to row 2 pointer
       "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
       "1: \n"
@@ -1426,7 +1426,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint16_t* dst,
                          int dst_width) {
-  asm volatile(
+  asm volatile (
       "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
       "movi v0.8h, #9 \n" // constants
       "movi v1.4s, #3 \n"
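The constants 9 and 3 loaded just above are the weights of the 2x bilinear upsampler, where each output mixes the nearest sample, its horizontal and vertical neighbours, and the diagonal as (9, 3, 3, 1)/16. A sketch of that blend, inferred from the constants rather than copied from the kernel:

#include <stdint.h>

// (9*near + 3*horiz + 3*vert + diag + 8) >> 4: the rounded 9:3:3:1 mix.
static inline uint16_t Blend9331_sketch(uint16_t near_px, uint16_t horiz,
                                        uint16_t vert, uint16_t diag) {
  return (uint16_t)((9 * near_px + 3 * horiz + 3 * vert + diag + 8) >> 4);
}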
@@ -1477,7 +1477,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
                           uint8_t* dst,
                           int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1496,7 +1496,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
                                 uint8_t* dst,
                                 int dst_width) {
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
       "subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1515,7 +1515,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
-  asm volatile(
+  asm volatile (
       // change the stride to row 2 pointer
       "add %1, %1, %0 \n"
       "1: \n"
@@ -1550,7 +1550,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
   const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
   const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
   (void)src_stride;
-  asm volatile(
+  asm volatile (
       "1: \n"
       "ld1 {v0.h}[0], [%0], %6 \n"
       "ld1 {v1.h}[0], [%1], %6 \n"
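The remaining hunks are in the RVV scaling path, where each kernel first programs the fixed-point rounding CSR with a bare asm volatile("csrwi vxrm, 0"). For context: in GCC/Clang extended asm, a statement with no output operands is already implicitly volatile, but writing the qualifier out documents that the instruction has a side effect the compiler cannot see and must not delete as dead code. A minimal sketch:

#include <stdint.h>

// A CSR write has no C-visible result, so nothing ties it into the data
// flow; "volatile" is what keeps the optimizer from discarding it.
static inline void SetRoundToNearestUp_sketch(void) {
#if defined(__riscv) && defined(__riscv_vector)
  asm volatile("csrwi vxrm, 0");  // 0 selects round-to-nearest-up (RNU)
#endif
}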
@@ -100,7 +100,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
   const uint32_t* src = (const uint32_t*)(src_argb);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_odd, v_even, v_dst;
     vuint32m4_t v_odd_32, v_even_32;
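The NOTE in this hunk refers to the RNU (round-to-nearest-up) encoding, value 0 of vxrm. In scalar terms, RNU adds the rounding bit before the shift, so the fixed-point averaging these kernels rely on behaves like the following sketch:

#include <stdint.h>

// vxrm = 0 (RNU): the +1 is the rounding increment applied before the
// 1-bit right shift, i.e. an average rounded half-up.
static inline uint8_t AveragingAddRnu_sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint32_t)a + b + 1) >> 1);
}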
@@ -165,7 +165,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
   const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
@@ -262,7 +262,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
   const int stride_byte = src_stepx * 4;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
@@ -340,7 +340,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_s0, v_s1, v_dst;
     size_t vl = __riscv_vsetvl_e8m4(w);
@@ -395,7 +395,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     size_t vl = __riscv_vsetvl_e8m4(w);
     vuint8m4_t v_s0, v_s1, v_t0, v_t1;
@@ -528,7 +528,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_t0, v_t1, v_t2, v_t3;
@@ -698,7 +698,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
@@ -827,7 +827,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
@@ -1490,7 +1490,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m4_t v_u0v0, v_u1v1, v_avg;
     vuint16m4_t v_u0v0_16, v_u1v1_16;
@@ -1559,7 +1559,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile("csrwi vxrm, 0");
+  asm volatile ("csrwi vxrm, 0");
   do {
     vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
     vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;