Scale by even factor low level row function

Bug: b/171884264
Change-Id: I6a94bde0aa05e681bb4590ea8beec33a61ddbfc9
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2518361
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-11-03 11:25:56 -08:00 committed by Commit Bot
parent f014dbd87a
commit b7a1c5ee5d
17 changed files with 7698 additions and 7554 deletions
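
The change adds a NEON kernel for ScaleUVRowDownEven, the row function used when an interleaved UV (chroma) plane is shrunk by an even factor with point sampling. As orientation, here is a hedged sketch of the portable behavior that kernel accelerates; it follows the ScaleUVRowDownEven_C convention, but treat the exact signature as an assumption rather than a copy of the shipped code:

#include <stddef.h>
#include <stdint.h>

// Sketch only: copy every src_stepx-th UV pair (2 bytes each) from the source
// row to the destination row. The NEON kernel in this commit vectorizes this
// strided copy; the Box variant additionally averages a 2x2 neighborhood.
static void ScaleUVRowDownEven_Sketch(const uint8_t* src_uv,
                                      ptrdiff_t src_stride,  // unused here
                                      int src_stepx,
                                      uint8_t* dst_uv,
                                      int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_uv[0] = src_uv[0];  // U
    dst_uv[1] = src_uv[1];  // V
    dst_uv += 2;
    src_uv += src_stepx * 2;
  }
}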

View File

@@ -113,6 +113,7 @@ extern "C" {
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

View File

@@ -29,38 +29,38 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
uint64_t diff = 0u;
asm volatile(
"xor %3,%3 \n"
"xor %%r8,%%r8 \n"
"xor %%r9,%%r9 \n"
"xor %%r10,%%r10 \n"
// Process 32 bytes per loop.
LABELALIGN
"1: \n"
"mov (%0),%%rcx \n"
"mov 0x8(%0),%%rdx \n"
"xor (%1),%%rcx \n"
"xor 0x8(%1),%%rdx \n"
"popcnt %%rcx,%%rcx \n"
"popcnt %%rdx,%%rdx \n"
"mov 0x10(%0),%%rsi \n"
"mov 0x18(%0),%%rdi \n"
"xor 0x10(%1),%%rsi \n"
"xor 0x18(%1),%%rdi \n"
"popcnt %%rsi,%%rsi \n"
"popcnt %%rdi,%%rdi \n"
"add $0x20,%0 \n"
"add $0x20,%1 \n"
"add %%rcx,%3 \n"
"add %%rdx,%%r8 \n"
"add %%rsi,%%r9 \n"
"add %%rdi,%%r10 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"add %%r8, %3 \n"
"add %%r9, %3 \n"
"add %%r10, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -80,26 +80,26 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
// Process 16 bytes per loop.
LABELALIGN
"1: \n"
"mov (%0),%%ecx \n"
"mov 0x4(%0),%%edx \n"
"xor (%1),%%ecx \n"
"xor 0x4(%1),%%edx \n"
"popcnt %%ecx,%%ecx \n"
"add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"mov 0x8(%0),%%ecx \n"
"mov 0xc(%0),%%edx \n"
"xor 0x8(%1),%%ecx \n"
"xor 0xc(%1),%%edx \n"
"popcnt %%ecx,%%ecx \n"
"add %%ecx,%3 \n"
"popcnt %%edx,%%edx \n"
"add %%edx,%3 \n"
"add $0x10,%0 \n"
"add $0x10,%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -121,46 +121,46 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
uint32_t diff = 0u;
asm volatile(
"movdqa %4,%%xmm2 \n"
"movdqa %5,%%xmm3 \n"
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm1,%%xmm1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"movdqa (%0),%%xmm4 \n"
"movdqa 0x10(%0), %%xmm5 \n"
"pxor (%0,%1), %%xmm4 \n"
"movdqa %%xmm4,%%xmm6 \n"
"pand %%xmm2,%%xmm6 \n"
"psrlw $0x4,%%xmm4 \n"
"movdqa %%xmm3,%%xmm7 \n"
"pshufb %%xmm6,%%xmm7 \n"
"pand %%xmm2,%%xmm4 \n"
"movdqa %%xmm3,%%xmm6 \n"
"pshufb %%xmm4,%%xmm6 \n"
"paddb %%xmm7,%%xmm6 \n"
"pxor 0x10(%0,%1),%%xmm5 \n"
"add $0x20,%0 \n"
"movdqa %%xmm5,%%xmm4 \n"
"pand %%xmm2,%%xmm5 \n"
"psrlw $0x4,%%xmm4 \n"
"movdqa %%xmm3,%%xmm7 \n"
"pshufb %%xmm5,%%xmm7 \n"
"pand %%xmm2,%%xmm4 \n"
"movdqa %%xmm3,%%xmm5 \n"
"pshufb %%xmm4,%%xmm5 \n"
"paddb %%xmm7,%%xmm5 \n"
"paddb %%xmm5,%%xmm6 \n"
"psadbw %%xmm1,%%xmm6 \n"
"paddd %%xmm6,%%xmm0 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"pshufd $0xaa,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0, %3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
"+r"(count), // %2
@@ -182,40 +182,40 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
asm volatile(
"vbroadcastf128 %4,%%ymm2 \n"
"vbroadcastf128 %5,%%ymm3 \n"
"vpxor %%ymm0,%%ymm0,%%ymm0 \n"
"vpxor %%ymm1,%%ymm1,%%ymm1 \n"
"sub %0,%1 \n"
LABELALIGN
"1: \n"
"vmovdqa (%0),%%ymm4 \n"
"vmovdqa 0x20(%0), %%ymm5 \n"
"vpxor (%0,%1), %%ymm4, %%ymm4 \n"
"vpand %%ymm2,%%ymm4,%%ymm6 \n"
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
"vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
"vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
"add $0x40,%0 \n"
"vpand %%ymm2,%%ymm4,%%ymm5 \n"
"vpsrlw $0x4,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
"vpand %%ymm2,%%ymm4,%%ymm4 \n"
"vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
"vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
"vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
"vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
"vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
"sub $0x40,%2 \n"
"jg 1b \n"
"vpermq $0xb1,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xaa,%%ymm0,%%ymm1 \n"
"vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
"vmovd %%xmm0, %3 \n"
"vzeroupper \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -234,34 +234,34 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
"pxor %%xmm0,%%xmm0 \n"
"pxor %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"movdqu (%1),%%xmm2 \n"
"lea 0x10(%1),%1 \n"
"movdqa %%xmm1,%%xmm3 \n"
"psubusb %%xmm2,%%xmm1 \n"
"psubusb %%xmm3,%%xmm2 \n"
"por %%xmm2,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm5,%%xmm1 \n"
"punpckhbw %%xmm5,%%xmm2 \n"
"pmaddwd %%xmm1,%%xmm1 \n"
"pmaddwd %%xmm2,%%xmm2 \n"
"paddd %%xmm1,%%xmm0 \n"
"paddd %%xmm2,%%xmm0 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"pshufd $0xee,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"pshufd $0x1,%%xmm0,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"movd %%xmm0,%3 \n"
: "+r"(src_a), // %0
"+r"(src_b), // %1
@@ -301,44 +301,44 @@ static const uvec32 kHashMul3 = {
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
uint32_t hash;
asm volatile(
"movd %2,%%xmm0 \n"
"pxor %%xmm7,%%xmm7 \n"
"movdqa %4,%%xmm6 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm1 \n"
"lea 0x10(%0),%0 \n"
"pmulld %%xmm6,%%xmm0 \n"
"movdqa %5,%%xmm5 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklbw %%xmm7,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"punpcklwd %%xmm7,%%xmm3 \n"
"pmulld %%xmm5,%%xmm3 \n"
"movdqa %6,%%xmm5 \n"
"movdqa %%xmm2,%%xmm4 \n"
"punpckhwd %%xmm7,%%xmm4 \n"
"pmulld %%xmm5,%%xmm4 \n"
"movdqa %7,%%xmm5 \n"
"punpckhbw %%xmm7,%%xmm1 \n"
"movdqa %%xmm1,%%xmm2 \n"
"punpcklwd %%xmm7,%%xmm2 \n"
"pmulld %%xmm5,%%xmm2 \n"
"movdqa %8,%%xmm5 \n"
"punpckhwd %%xmm7,%%xmm1 \n"
"pmulld %%xmm5,%%xmm1 \n"
"paddd %%xmm4,%%xmm3 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm3,%%xmm1 \n"
"pshufd $0xe,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"pshufd $0x1,%%xmm1,%%xmm2 \n"
"paddd %%xmm2,%%xmm1 \n"
"paddd %%xmm1,%%xmm0 \n"
"sub $0x10,%1 \n"
"jg 1b \n"
"movd %%xmm0,%3 \n"
: "+r"(src), // %0
"+r"(count), // %1
"+rm"(seed), // %2

View File

@@ -29,24 +29,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
uint32_t diff;
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
@@ -59,29 +59,29 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");

View File

@@ -27,24 +27,24 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
int count) {
uint32_t diff;
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
@@ -56,30 +56,30 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
int count) {
uint32_t sse;
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");

View File

@@ -75,9 +75,9 @@ void CpuId(int info_eax, int info_ecx, int* cpu_info) {
asm volatile(
#if defined(__i386__) && defined(__PIC__)
// Preserve ebx for fpic 32 bit.
"mov %%ebx, %%edi \n"
"cpuid \n"
"xchg %%edi, %%ebx \n"
: "=D"(info_ebx),
#else
"cpuid \n"

View File

@@ -31,75 +31,75 @@ void TransposeWx8_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n"
"movq (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"movq (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"movq (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movq (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"movq (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movq (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"lea 0x8(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"sub $0x8,%2 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -121,127 +121,127 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%3),%%xmm1 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqu (%0),%%xmm2 \n"
"movdqa %%xmm0,%%xmm1 \n"
"movdqa %%xmm8,%%xmm9 \n"
"palignr $0x8,%%xmm1,%%xmm1 \n"
"palignr $0x8,%%xmm9,%%xmm9 \n"
"movdqu (%0,%3),%%xmm3 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm2,%%xmm10 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm10 \n"
"movdqa %%xmm2,%%xmm3 \n"
"movdqa %%xmm10,%%xmm11 \n"
"movdqu (%0),%%xmm4 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"movdqu (%0,%3),%%xmm5 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm4,%%xmm12 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm12 \n"
"movdqa %%xmm4,%%xmm5 \n"
"movdqa %%xmm12,%%xmm13 \n"
"movdqu (%0),%%xmm6 \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movdqu (%0,%3),%%xmm7 \n"
"lea (%0,%3,2),%0 \n"
"movdqa %%xmm6,%%xmm14 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"punpckhbw %%xmm7,%%xmm14 \n"
"neg %3 \n"
"movdqa %%xmm6,%%xmm7 \n"
"movdqa %%xmm14,%%xmm15 \n"
"lea 0x10(%0,%3,8),%0 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"neg %3 \n"
// Second round of bit swap.
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm0,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"palignr $0x8,%%xmm2,%%xmm2 \n"
"palignr $0x8,%%xmm3,%%xmm3 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm4,%%xmm6 \n"
"movdqa %%xmm5,%%xmm7 \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"punpcklwd %%xmm10,%%xmm8 \n"
"punpcklwd %%xmm11,%%xmm9 \n"
"movdqa %%xmm8,%%xmm10 \n"
"movdqa %%xmm9,%%xmm11 \n"
"palignr $0x8,%%xmm10,%%xmm10 \n"
"palignr $0x8,%%xmm11,%%xmm11 \n"
"punpcklwd %%xmm14,%%xmm12 \n"
"punpcklwd %%xmm15,%%xmm13 \n"
"movdqa %%xmm12,%%xmm14 \n"
"movdqa %%xmm13,%%xmm15 \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
// Third round of bit swap.
// Write to the destination pointer.
"punpckldq %%xmm4,%%xmm0 \n"
"movq %%xmm0,(%1) \n"
"movdqa %%xmm0,%%xmm4 \n"
"palignr $0x8,%%xmm4,%%xmm4 \n"
"movq %%xmm4,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movdqa %%xmm2,%%xmm6 \n"
"movq %%xmm2,(%1) \n"
"palignr $0x8,%%xmm6,%%xmm6 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movq %%xmm6,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm1,%%xmm5 \n"
"movq %%xmm1,(%1) \n"
"palignr $0x8,%%xmm5,%%xmm5 \n"
"movq %%xmm5,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movq %%xmm3,(%1) \n"
"movdqa %%xmm3,%%xmm7 \n"
"palignr $0x8,%%xmm7,%%xmm7 \n"
"movq %%xmm7,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm12,%%xmm8 \n"
"movq %%xmm8,(%1) \n"
"movdqa %%xmm8,%%xmm12 \n"
"palignr $0x8,%%xmm12,%%xmm12 \n"
"movq %%xmm12,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm14,%%xmm10 \n"
"movdqa %%xmm10,%%xmm14 \n"
"movq %%xmm10,(%1) \n"
"palignr $0x8,%%xmm14,%%xmm14 \n"
"punpckldq %%xmm13,%%xmm9 \n"
"movq %%xmm14,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"movdqa %%xmm9,%%xmm13 \n"
"movq %%xmm9,(%1) \n"
"palignr $0x8,%%xmm13,%%xmm13 \n"
"movq %%xmm13,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"punpckldq %%xmm15,%%xmm11 \n"
"movq %%xmm11,(%1) \n"
"movdqa %%xmm11,%%xmm15 \n"
"palignr $0x8,%%xmm15,%%xmm15 \n"
"sub $0x10,%2 \n"
"movq %%xmm15,(%1,%4) \n"
"lea (%1,%4,2),%1 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -266,95 +266,95 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
// Read in the data from the source pointer.
// First round of bit swap.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%4),%%xmm1 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm0,%%xmm8 \n"
"punpcklbw %%xmm1,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm8 \n"
"movdqa %%xmm8,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"movdqu (%0,%4),%%xmm3 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpcklbw %%xmm3,%%xmm2 \n"
"punpckhbw %%xmm3,%%xmm8 \n"
"movdqa %%xmm8,%%xmm3 \n"
"movdqu (%0),%%xmm4 \n"
"movdqu (%0,%4),%%xmm5 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm4,%%xmm8 \n"
"punpcklbw %%xmm5,%%xmm4 \n"
"punpckhbw %%xmm5,%%xmm8 \n"
"movdqa %%xmm8,%%xmm5 \n"
"movdqu (%0),%%xmm6 \n"
"movdqu (%0,%4),%%xmm7 \n"
"lea (%0,%4,2),%0 \n"
"movdqa %%xmm6,%%xmm8 \n"
"punpcklbw %%xmm7,%%xmm6 \n"
"neg %4 \n"
"lea 0x10(%0,%4,8),%0 \n"
"punpckhbw %%xmm7,%%xmm8 \n"
"movdqa %%xmm8,%%xmm7 \n"
"neg %4 \n"
// Second round of bit swap.
"movdqa %%xmm0,%%xmm8 \n"
"movdqa %%xmm1,%%xmm9 \n"
"punpckhwd %%xmm2,%%xmm8 \n"
"punpckhwd %%xmm3,%%xmm9 \n"
"punpcklwd %%xmm2,%%xmm0 \n"
"punpcklwd %%xmm3,%%xmm1 \n"
"movdqa %%xmm8,%%xmm2 \n"
"movdqa %%xmm9,%%xmm3 \n"
"movdqa %%xmm4,%%xmm8 \n"
"movdqa %%xmm5,%%xmm9 \n"
"punpckhwd %%xmm6,%%xmm8 \n"
"punpckhwd %%xmm7,%%xmm9 \n"
"punpcklwd %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm7,%%xmm5 \n"
"movdqa %%xmm8,%%xmm6 \n"
"movdqa %%xmm9,%%xmm7 \n"
// Third round of bit swap.
// Write to the destination pointer.
"movdqa %%xmm0,%%xmm8 \n"
"punpckldq %%xmm4,%%xmm0 \n"
"movlpd %%xmm0,(%1) \n" // Write back U channel
"movhpd %%xmm0,(%2) \n" // Write back V channel
"punpckhdq %%xmm4,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm2,%%xmm8 \n"
"punpckldq %%xmm6,%%xmm2 \n"
"movlpd %%xmm2,(%1) \n"
"movhpd %%xmm2,(%2) \n"
"punpckhdq %%xmm6,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm1,%%xmm8 \n"
"punpckldq %%xmm5,%%xmm1 \n"
"movlpd %%xmm1,(%1) \n"
"movhpd %%xmm1,(%2) \n"
"punpckhdq %%xmm5,%%xmm8 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"movdqa %%xmm3,%%xmm8 \n"
"punpckldq %%xmm7,%%xmm3 \n"
"movlpd %%xmm3,(%1) \n"
"movhpd %%xmm3,(%2) \n"
"punpckhdq %%xmm7,%%xmm8 \n"
"sub $0x8,%3 \n"
"movlpd %%xmm8,(%1,%5) \n"
"lea (%1,%5,2),%1 \n"
"movhpd %%xmm8,(%2,%6) \n"
"lea (%2,%6,2),%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2

View File

@@ -38,52 +38,52 @@ void TransposeWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"vld1.8 {d0}, [%0], %2 \n"
"vld1.8 {d1}, [%0], %2 \n"
"vld1.8 {d2}, [%0], %2 \n"
"vld1.8 {d3}, [%0], %2 \n"
"vld1.8 {d4}, [%0], %2 \n"
"vld1.8 {d5}, [%0], %2 \n"
"vld1.8 {d6}, [%0], %2 \n"
"vld1.8 {d7}, [%0] \n"
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"
"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"
"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"mov %0, %3 \n"
"vst1.8 {d1}, [%0], %4 \n"
"vst1.8 {d0}, [%0], %4 \n"
"vst1.8 {d3}, [%0], %4 \n"
"vst1.8 {d2}, [%0], %4 \n"
"vst1.8 {d5}, [%0], %4 \n"
"vst1.8 {d4}, [%0], %4 \n"
"vst1.8 {d7}, [%0], %4 \n"
"vst1.8 {d6}, [%0] \n"
"add %1, #8 \n" // src += 8
"add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
"subs %5, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
@@ -208,68 +208,70 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"vld2.8 {d0, d1}, [%0], %2 \n"
"vld2.8 {d2, d3}, [%0], %2 \n"
"vld2.8 {d4, d5}, [%0], %2 \n"
"vld2.8 {d6, d7}, [%0], %2 \n"
"vld2.8 {d16, d17}, [%0], %2 \n"
"vld2.8 {d18, d19}, [%0], %2 \n"
"vld2.8 {d20, d21}, [%0], %2 \n"
"vld2.8 {d22, d23}, [%0] \n"
"vtrn.8 q1, q0 \n"
"vtrn.8 q3, q2 \n"
"vtrn.8 q9, q8 \n"
"vtrn.8 q11, q10 \n"
"vtrn.16 q1, q3 \n"
"vtrn.16 q0, q2 \n"
"vtrn.16 q9, q11 \n"
"vtrn.16 q8, q10 \n"
"vtrn.32 q1, q9 \n"
"vtrn.32 q0, q8 \n"
"vtrn.32 q3, q11 \n"
"vtrn.32 q2, q10 \n"
"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"
"vrev16.8 q8, q8 \n"
"vrev16.8 q9, q9 \n"
"vrev16.8 q10, q10 \n"
"vrev16.8 q11, q11 \n"
"mov %0, %3 \n"
"vst1.8 {d2}, [%0], %4 \n"
"vst1.8 {d0}, [%0], %4 \n"
"vst1.8 {d6}, [%0], %4 \n"
"vst1.8 {d4}, [%0], %4 \n"
"vst1.8 {d18}, [%0], %4 \n"
"vst1.8 {d16}, [%0], %4 \n"
"vst1.8 {d22}, [%0], %4 \n"
"vst1.8 {d20}, [%0] \n"
"mov %0, %5 \n"
"vst1.8 {d3}, [%0], %6 \n"
"vst1.8 {d1}, [%0], %6 \n"
"vst1.8 {d7}, [%0], %6 \n"
"vst1.8 {d5}, [%0], %6 \n"
"vst1.8 {d19}, [%0], %6 \n"
"vst1.8 {d17}, [%0], %6 \n"
"vst1.8 {d23}, [%0], %6 \n"
"vst1.8 {d21}, [%0] \n"
"add %1, #8*2 \n" // src += 8*2
"add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %7, #8 \n" // w -= 8
"bge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.

View File

@@ -34,74 +34,74 @@ void TransposeWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
"ld1 {v1.8b}, [%0], %5 \n"
"ld1 {v2.8b}, [%0], %5 \n"
"ld1 {v3.8b}, [%0], %5 \n"
"ld1 {v4.8b}, [%0], %5 \n"
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"trn1 v17.8b, v0.8b, v1.8b \n"
"add %0, %0, %5 \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 1
"trn1 v19.8b, v2.8b, v3.8b \n"
"add %0, %0, %5 \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 2
"trn1 v21.8b, v4.8b, v5.8b \n"
"add %0, %0, %5 \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 3
"trn1 v23.8b, v6.8b, v7.8b \n"
"add %0, %0, %5 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 4
"trn1 v1.4h, v17.4h, v19.4h \n"
"add %0, %0, %5 \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 5
"trn1 v0.4h, v16.4h, v18.4h \n"
"add %0, %0, %5 \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 6
"trn1 v5.4h, v21.4h, v23.4h \n"
"add %0, %0, %5 \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 7
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
"trn1 v17.2s, v1.2s, v5.2s \n"
"trn2 v20.2s, v0.2s, v4.2s \n"
"trn1 v16.2s, v0.2s, v4.2s \n"
"trn2 v23.2s, v3.2s, v7.2s \n"
"trn1 v19.2s, v3.2s, v7.2s \n"
"trn2 v22.2s, v2.2s, v6.2s \n"
"trn1 v18.2s, v2.2s, v6.2s \n"
"mov %0, %2 \n"
"st1 {v17.8b}, [%0], %6 \n"
"st1 {v16.8b}, [%0], %6 \n"
"st1 {v19.8b}, [%0], %6 \n"
"st1 {v18.8b}, [%0], %6 \n"
"st1 {v21.8b}, [%0], %6 \n"
"st1 {v20.8b}, [%0], %6 \n"
"st1 {v23.8b}, [%0], %6 \n"
"st1 {v22.8b}, [%0] \n"
"add %1, %1, #8 \n" // src += 8
"add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride
@@ -110,33 +110,33 @@ void TransposeWx8_NEON(const uint8_t* src,
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w3, #2 \n"
"b.lt 3f \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
@@ -228,90 +228,90 @@ void TransposeUVWx8_NEON(const uint8_t* src,
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.16b}, [%0], %5 \n"
"ld1 {v1.16b}, [%0], %5 \n"
"ld1 {v2.16b}, [%0], %5 \n"
"ld1 {v3.16b}, [%0], %5 \n"
"ld1 {v4.16b}, [%0], %5 \n"
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"mov %0, %1 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %0, %2 \n"
"st1 {v16.d}[0], [%0], %6 \n"
"st1 {v18.d}[0], [%0], %6 \n"
"st1 {v17.d}[0], [%0], %6 \n"
"st1 {v19.d}[0], [%0], %6 \n"
"st1 {v16.d}[1], [%0], %6 \n"
"st1 {v18.d}[1], [%0], %6 \n"
"st1 {v17.d}[1], [%0], %6 \n"
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
"st1 {v20.d}[0], [%0], %7 \n"
"st1 {v22.d}[0], [%0], %7 \n"
"st1 {v21.d}[0], [%0], %7 \n"
"st1 {v23.d}[0], [%0], %7 \n"
"st1 {v20.d}[1], [%0], %7 \n"
"st1 {v22.d}[1], [%0], %7 \n"
"st1 {v21.d}[1], [%0], %7 \n"
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n"
"b.lt 3f \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -490,6 +490,13 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
4,
1)
#endif
#ifdef HAS_SCALEUVROWDOWNEVEN_NEON
SDAANY(ScaleUVRowDownEven_Any_NEON,
ScaleUVRowDownEven_NEON,
ScaleUVRowDownEven_C,
2,
3)
#endif
#ifdef SASIMDONLY
// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
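
The SDAANY instantiation above wraps the new NEON kernel so it can run on any output width: the SIMD path covers the largest width that is a multiple of 4 UV pairs (MASK 3, BPP 2), and the C row function finishes the remainder. A hedged sketch of that split, with illustrative names and assumed prototypes rather than the actual macro expansion:

#include <stddef.h>
#include <stdint.h>

// Assumed row-function prototypes (declared in libyuv's scale_row.h).
void ScaleUVRowDownEven_NEON(const uint8_t* src_uv, ptrdiff_t src_stride,
                             int src_stepx, uint8_t* dst_uv, int dst_width);
void ScaleUVRowDownEven_C(const uint8_t* src_uv, ptrdiff_t src_stride,
                          int src_stepx, uint8_t* dst_uv, int dst_width);

// Illustrative expansion of the "Any" idea: SIMD for the multiple-of-4 prefix,
// C for the 0-3 leftover UV pairs.
void ScaleUVRowDownEven_Any_Sketch(const uint8_t* src_uv, ptrdiff_t src_stride,
                                   int src_stepx, uint8_t* dst_uv,
                                   int dst_width) {
  int r = dst_width & 3;   // residual output pixels
  int n = dst_width - r;   // width handled by the NEON kernel
  if (n > 0) {
    ScaleUVRowDownEven_NEON(src_uv, src_stride, src_stepx, dst_uv, n);
  }
  ScaleUVRowDownEven_C(src_uv + n * src_stepx * 2, src_stride, src_stepx,
                       dst_uv + n * 2, r);
}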

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -299,6 +299,14 @@ static void ScaleUVDownEven(int src_width,
}
#endif
#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
if (TestCpuFlag(kCpuHasNEON) && !filtering) {
ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON;
if (IS_ALIGNED(dst_width, 4)) {
ScaleUVRowDownEven = ScaleUVRowDownEven_NEON;
}
}
#endif  // TODO(fbarchard): Enable Box filter
#if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON
: ScaleUVRowDownEven_Any_NEON;
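
The hunk above wires the kernel into ScaleUVDownEven using libyuv's usual runtime dispatch: take the Any wrapper whenever TestCpuFlag reports NEON and no filtering is requested, then upgrade to the full-width kernel when dst_width is a multiple of 4. In isolation, the selection logic looks roughly like this (the function-pointer typedef and helper name are illustrative, not the scale_uv.cc code verbatim):

#include "libyuv/cpu_id.h"     // assumed available: TestCpuFlag, kCpuHasNEON
#include "libyuv/scale_row.h"  // assumed available: row function declarations

typedef void (*ScaleUVRowDownEvenFn)(const uint8_t* src_uv,
                                     ptrdiff_t src_stride, int src_stepx,
                                     uint8_t* dst_uv, int dst_width);

static ScaleUVRowDownEvenFn ChooseUVDownEvenRow(int dst_width, int filtering) {
  ScaleUVRowDownEvenFn fn = ScaleUVRowDownEven_C;  // portable default
#if defined(HAS_SCALEUVROWDOWNEVEN_NEON)
  if (TestCpuFlag(kCpuHasNEON) && !filtering) {
    fn = ScaleUVRowDownEven_Any_NEON;       // handles any width
    if ((dst_width % 4) == 0) {             // IS_ALIGNED(dst_width, 4) upstream
      fn = ScaleUVRowDownEven_NEON;         // full-width fast path
    }
  }
#endif
  return fn;
}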

View File

@@ -1052,4 +1052,61 @@ TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR
#undef SX
#undef DX
TEST_F(LibYUVScaleTest, PlaneTest3x) {
const int kSrcStride = 48;
const int kDstStride = 16;
const int kSize = kSrcStride * 3;
align_buffer_page_end(orig_pixels, kSize);
for (int i = 0; i < 48 * 3; ++i) {
orig_pixels[i] = i;
}
align_buffer_page_end(dest_pixels, kDstStride);
int iterations16 =
benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
for (int i = 0; i < iterations16; ++i) {
ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
kFilterBilinear);
}
EXPECT_EQ(49, dest_pixels[0]);
ScalePlane(orig_pixels, kSrcStride, 48, 3, dest_pixels, kDstStride, 16, 1,
kFilterNone);
EXPECT_EQ(49, dest_pixels[0]);
free_aligned_buffer_page_end(dest_pixels);
free_aligned_buffer_page_end(orig_pixels);
}
TEST_F(LibYUVScaleTest, PlaneTest4x) {
const int kSrcStride = 64;
const int kDstStride = 16;
const int kSize = kSrcStride * 4;
align_buffer_page_end(orig_pixels, kSize);
for (int i = 0; i < 64 * 4; ++i) {
orig_pixels[i] = i;
}
align_buffer_page_end(dest_pixels, kDstStride);
int iterations16 =
benchmark_width_ * benchmark_height_ / (16 * 1) * benchmark_iterations_;
for (int i = 0; i < iterations16; ++i) {
ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
kFilterBilinear);
}
EXPECT_EQ((65 + 66 + 129 + 130 + 2) / 4, dest_pixels[0]);
ScalePlane(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1,
kFilterNone);
EXPECT_EQ(130, dest_pixels[0]); // expect the 3rd pixel of the 3rd row
free_aligned_buffer_page_end(dest_pixels);
free_aligned_buffer_page_end(orig_pixels);
}
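
For reference, the expected values in these tests follow directly from orig_pixels[i] = i. In the 4x case, the bilinear path ends up rounding the average of the four source samples at rows 1-2, columns 1-2 (65, 66, 129, 130), while kFilterNone point-samples row 2, column 2. A hypothetical helper, not part of the test, that reproduces that arithmetic:

// Reproduces the 4x expectations above for orig_pixels[i] = i, kSrcStride 64.
static int Expected4xFirstPixel(bool bilinear) {
  const int kSrcStride = 64;
  if (!bilinear) {
    return 2 * kSrcStride + 2;     // 130, the 3rd pixel of the 3rd row
  }
  int a = 1 * kSrcStride + 1;      // 65
  int b = 1 * kSrcStride + 2;      // 66
  int c = 2 * kSrcStride + 1;      // 129
  int d = 2 * kSrcStride + 2;      // 130
  return (a + b + c + d + 2) / 4;  // 98, matches the EXPECT_EQ above
}

The 3x case works out the same way: both filters land on the sample at row 1, column 1 of the source (1 * 48 + 1 = 49), which is why both EXPECT_EQ checks use 49.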
} // namespace libyuv