Mirror of https://chromium.googlesource.com/libyuv/libyuv, synced 2025-12-08 01:36:47 +08:00
scale float samples and return max value
BUG=libyuv:717
TEST=ScaleSum unittest to compare C vs Arm implementation
TBR=kjellander@chromium.org
Change-Id: Iaa7af5547d979aad4722f868d31b405340115748
Reviewed-on: https://chromium-review.googlesource.com/600534
Reviewed-by: Cheng Wang <wangcheng@google.com>
parent 27036e33e8
commit 8676ad7004
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1663
+Version: 1664
 License: BSD
 License File: LICENSE
@@ -359,6 +359,11 @@ extern "C" {
 #define HAS_SOBELYROW_NEON
 #endif
 
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALESUMSAMPLES_NEON
+#endif
+
 // The following are available on Mips platforms:
 #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
     (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
@@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 const uint8* luma,
                                 uint32 lumacoeff);
 
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
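For reference, a sketch of how a caller might use the new entry points. The names and signatures come from the declarations above; the calling code itself is illustrative only, and the NEON variants additionally assume width is a multiple of 8:

  // Scale a float buffer and capture the peak of the scaled values.
  float in[1024];   // ...filled with samples elsewhere...
  float out[1024];
  float peak = ScaleSumSamples_C(in, out, 1.2f, 1024);
  if (peak > 1.0f) {
    // Renormalize with the plain scaling kernel (no max needed this time).
    ScaleSamples_C(out, out, 1.0f / peak, 1024);
  }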
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1663
+#define LIBYUV_VERSION 1664
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -26,67 +26,61 @@ extern "C" {
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 diff;
 
-  asm volatile (
+  asm volatile(
     "vmov.u16 q4, #0 \n"  // accumulator

    "1: \n"
    "vld1.8 {q0, q1}, [%0]! \n"
    "vld1.8 {q2, q3}, [%1]! \n"
    "veor.32 q0, q0, q2 \n"
    "veor.32 q1, q1, q3 \n"
    "vcnt.i8 q0, q0 \n"
    "vcnt.i8 q1, q1 \n"
    "subs %2, %2, #32 \n"
    "vadd.u8 q0, q0, q1 \n"  // 16 byte counts
    "vpadal.u8 q4, q0 \n"  // 8 shorts
    "bgt 1b \n"

    "vpaddl.u16 q0, q4 \n"  // 4 ints
    "vpadd.u32 d0, d0, d1 \n"
    "vpadd.u32 d0, d0, d0 \n"
    "vmov.32 %3, d0[0] \n"

-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(diff)
+    : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
    :
    : "cc", "q0", "q1", "q2", "q3", "q4");
  return diff;
 }
 
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
-  asm volatile (
+  asm volatile(
    "vmov.u8 q8, #0 \n"
    "vmov.u8 q10, #0 \n"
    "vmov.u8 q9, #0 \n"
    "vmov.u8 q11, #0 \n"

    "1: \n"
    "vld1.8 {q0}, [%0]! \n"
    "vld1.8 {q1}, [%1]! \n"
    "subs %2, %2, #16 \n"
    "vsubl.u8 q2, d0, d2 \n"
    "vsubl.u8 q3, d1, d3 \n"
    "vmlal.s16 q8, d4, d4 \n"
    "vmlal.s16 q9, d6, d6 \n"
    "vmlal.s16 q10, d5, d5 \n"
    "vmlal.s16 q11, d7, d7 \n"
    "bgt 1b \n"

    "vadd.u32 q8, q8, q9 \n"
    "vadd.u32 q10, q10, q11 \n"
    "vadd.u32 q11, q8, q10 \n"
    "vpaddl.u32 q1, q11 \n"
    "vadd.u64 d0, d2, d3 \n"
    "vmov.32 %3, d0[0] \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
+    : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
    :
    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  return sse;
 }
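The NEON kernel above computes the same result as this scalar reference. A sketch, not the library's own C path, assuming a compiler with __builtin_popcount (the veor/vcnt pair is XOR followed by a per-byte population count, accumulated into 16-bit lanes):

  // Scalar model of HammingDistance_NEON: count the bits that differ.
  // count is assumed to be a positive multiple of 32, matching the NEON loop.
  uint32 HammingDistance_Ref(const uint8* src_a, const uint8* src_b, int count) {
    uint32 diff = 0;
    for (int i = 0; i < count; ++i) {
      diff += __builtin_popcount(src_a[i] ^ src_b[i]);
    }
    return diff;
  }

The 16-bit accumulator (q4/v4) caps the usable count: each of the 8 halfword lanes gains at most 16 per 32-byte iteration and saturates near 65535, roughly 4095 iterations, which is the "restricts count to 131 KB" limit noted in the AArch64 file below.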
@@ -24,63 +24,57 @@ extern "C" {
 // uses short accumulator which restricts count to 131 KB
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 diff;
-  asm volatile (
+  asm volatile(
    "movi v4.8h, #0 \n"

    "1: \n"
    "ld1 {v0.16b, v1.16b}, [%0], #32 \n"
    "ld1 {v2.16b, v3.16b}, [%1], #32 \n"
    "eor v0.16b, v0.16b, v2.16b \n"
    "eor v1.16b, v1.16b, v3.16b \n"
    "cnt v0.16b, v0.16b \n"
    "cnt v1.16b, v1.16b \n"
    "subs %w2, %w2, #32 \n"
    "add v0.16b, v0.16b, v1.16b \n"
    "uadalp v4.8h, v0.16b \n"
    "b.gt 1b \n"

    "uaddlv s4, v4.8h \n"
    "fmov %w3, s4 \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(diff)
+    : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
    :
    : "cc", "v0", "v1", "v2", "v3", "v4");
  return diff;
 }
 
 uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 sse;
-  asm volatile (
+  asm volatile(
    "eor v16.16b, v16.16b, v16.16b \n"
    "eor v18.16b, v18.16b, v18.16b \n"
    "eor v17.16b, v17.16b, v17.16b \n"
    "eor v19.16b, v19.16b, v19.16b \n"

    "1: \n"
    "ld1 {v0.16b}, [%0], #16 \n"
    "ld1 {v1.16b}, [%1], #16 \n"
    "subs %w2, %w2, #16 \n"
    "usubl v2.8h, v0.8b, v1.8b \n"
    "usubl2 v3.8h, v0.16b, v1.16b \n"
    "smlal v16.4s, v2.4h, v2.4h \n"
    "smlal v17.4s, v3.4h, v3.4h \n"
    "smlal2 v18.4s, v2.8h, v2.8h \n"
    "smlal2 v19.4s, v3.8h, v3.8h \n"
    "b.gt 1b \n"

    "add v16.4s, v16.4s, v17.4s \n"
    "add v18.4s, v18.4s, v19.4s \n"
    "add v19.4s, v16.4s, v18.4s \n"
    "addv s0, v19.4s \n"
    "fmov %w3, s0 \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
+    : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
    :
    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
  return sse;
 }
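For comparison, a scalar model of SumSquareError_NEON; a sketch only (libyuv keeps its real SumSquareError_C in compare_common.cc), matching the usubl/smlal accumulate sequence above:

  // Scalar model: sum of squared byte differences.
  // count is assumed to be a positive multiple of 16, matching the NEON loop.
  uint32 SumSquareError_Ref(const uint8* src_a, const uint8* src_b, int count) {
    uint32 sse = 0;
    for (int i = 0; i < count; ++i) {
      int d = src_a[i] - src_b[i];   // widen before squaring, as usubl does
      sse += (uint32)(d * d);
    }
    return sse;
  }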
@@ -30,14 +30,14 @@ void TransposeWx8_NEON(const uint8* src,
                        int dst_stride,
                        int width) {
   const uint8* src_temp;
-  asm volatile (
+  asm volatile(
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub %w3, %w3, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
    "mov %0, %1 \n"

    "ld1 {v0.8b}, [%0], %5 \n"
@@ -92,109 +92,108 @@ void TransposeWx8_NEON(const uint8* src,
    "subs %w3, %w3, #8 \n"  // w -= 8
    "b.ge 1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds %w3, %w3, #8 \n"
    "b.eq 4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %w3, #2 \n"
    "b.lt 3f \n"

    "cmp %w3, #4 \n"
    "b.lt 2f \n"

    // 4x8 block
    "mov %0, %1 \n"
    "ld1 {v0.s}[0], [%0], %5 \n"
    "ld1 {v0.s}[1], [%0], %5 \n"
    "ld1 {v0.s}[2], [%0], %5 \n"
    "ld1 {v0.s}[3], [%0], %5 \n"
    "ld1 {v1.s}[0], [%0], %5 \n"
    "ld1 {v1.s}[1], [%0], %5 \n"
    "ld1 {v1.s}[2], [%0], %5 \n"
    "ld1 {v1.s}[3], [%0] \n"

    "mov %0, %2 \n"

    "ld1 {v2.16b}, [%4] \n"

    "tbl v3.16b, {v0.16b}, v2.16b \n"
    "tbl v0.16b, {v1.16b}, v2.16b \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    "st1 {v3.s}[0], [%0], %6 \n"
    "st1 {v3.s}[1], [%0], %6 \n"
    "st1 {v3.s}[2], [%0], %6 \n"
    "st1 {v3.s}[3], [%0] \n"

    "add %0, %2, #4 \n"
    "st1 {v0.s}[0], [%0], %6 \n"
    "st1 {v0.s}[1], [%0], %6 \n"
    "st1 {v0.s}[2], [%0], %6 \n"
    "st1 {v0.s}[3], [%0] \n"

    "add %1, %1, #4 \n"  // src += 4
    "add %2, %2, %6, lsl #2 \n"  // dst += 4 * dst_stride
    "subs %w3, %w3, #4 \n"  // w -= 4
    "b.eq 4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %w3, #2 \n"
    "b.lt 3f \n"

    // 2x8 block
    "2: \n"
    "mov %0, %1 \n"
    "ld1 {v0.h}[0], [%0], %5 \n"
    "ld1 {v1.h}[0], [%0], %5 \n"
    "ld1 {v0.h}[1], [%0], %5 \n"
    "ld1 {v1.h}[1], [%0], %5 \n"
    "ld1 {v0.h}[2], [%0], %5 \n"
    "ld1 {v1.h}[2], [%0], %5 \n"
    "ld1 {v0.h}[3], [%0], %5 \n"
    "ld1 {v1.h}[3], [%0] \n"

    "trn2 v2.8b, v0.8b, v1.8b \n"
    "trn1 v3.8b, v0.8b, v1.8b \n"

    "mov %0, %2 \n"

    "st1 {v3.8b}, [%0], %6 \n"
    "st1 {v2.8b}, [%0] \n"

    "add %1, %1, #2 \n"  // src += 2
    "add %2, %2, %6, lsl #1 \n"  // dst += 2 * dst_stride
    "subs %w3, %w3, #2 \n"  // w -= 2
    "b.eq 4f \n"

    // 1x8 block
    "3: \n"
    "ld1 {v0.b}[0], [%1], %5 \n"
    "ld1 {v0.b}[1], [%1], %5 \n"
    "ld1 {v0.b}[2], [%1], %5 \n"
    "ld1 {v0.b}[3], [%1], %5 \n"
    "ld1 {v0.b}[4], [%1], %5 \n"
    "ld1 {v0.b}[5], [%1], %5 \n"
    "ld1 {v0.b}[6], [%1], %5 \n"
    "ld1 {v0.b}[7], [%1] \n"

    "st1 {v0.8b}, [%2] \n"

    "4: \n"

    : "=&r"(src_temp),  // %0
      "+r"(src),  // %1
      "+r"(dst),  // %2
      "+r"(width)  // %3
    : "r"(&kVTbl4x4Transpose),  // %4
      "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
      "r"(static_cast<ptrdiff_t>(dst_stride))  // %6
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-  );
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
 
 static uint8 kVTbl4x4TransposeDi[32] = {
@@ -209,212 +208,215 @@ void TransposeUVWx8_NEON(const uint8* src,
                          int dst_stride_b,
                          int width) {
   const uint8* src_temp;
-  asm volatile (
+  asm volatile(
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub %w4, %w4, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
    "mov %0, %1 \n"

    "ld1 {v0.16b}, [%0], %5 \n"
    "ld1 {v1.16b}, [%0], %5 \n"
    "ld1 {v2.16b}, [%0], %5 \n"
    "ld1 {v3.16b}, [%0], %5 \n"
    "ld1 {v4.16b}, [%0], %5 \n"
    "ld1 {v5.16b}, [%0], %5 \n"
    "ld1 {v6.16b}, [%0], %5 \n"
    "ld1 {v7.16b}, [%0] \n"

    "trn1 v16.16b, v0.16b, v1.16b \n"
    "trn2 v17.16b, v0.16b, v1.16b \n"
    "trn1 v18.16b, v2.16b, v3.16b \n"
    "trn2 v19.16b, v2.16b, v3.16b \n"
    "trn1 v20.16b, v4.16b, v5.16b \n"
    "trn2 v21.16b, v4.16b, v5.16b \n"
    "trn1 v22.16b, v6.16b, v7.16b \n"
    "trn2 v23.16b, v6.16b, v7.16b \n"

    "trn1 v0.8h, v16.8h, v18.8h \n"
    "trn2 v1.8h, v16.8h, v18.8h \n"
    "trn1 v2.8h, v20.8h, v22.8h \n"
    "trn2 v3.8h, v20.8h, v22.8h \n"
    "trn1 v4.8h, v17.8h, v19.8h \n"
    "trn2 v5.8h, v17.8h, v19.8h \n"
    "trn1 v6.8h, v21.8h, v23.8h \n"
    "trn2 v7.8h, v21.8h, v23.8h \n"

    "trn1 v16.4s, v0.4s, v2.4s \n"
    "trn2 v17.4s, v0.4s, v2.4s \n"
    "trn1 v18.4s, v1.4s, v3.4s \n"
    "trn2 v19.4s, v1.4s, v3.4s \n"
    "trn1 v20.4s, v4.4s, v6.4s \n"
    "trn2 v21.4s, v4.4s, v6.4s \n"
    "trn1 v22.4s, v5.4s, v7.4s \n"
    "trn2 v23.4s, v5.4s, v7.4s \n"

    "mov %0, %2 \n"

    "st1 {v16.d}[0], [%0], %6 \n"
    "st1 {v18.d}[0], [%0], %6 \n"
    "st1 {v17.d}[0], [%0], %6 \n"
    "st1 {v19.d}[0], [%0], %6 \n"
    "st1 {v16.d}[1], [%0], %6 \n"
    "st1 {v18.d}[1], [%0], %6 \n"
    "st1 {v17.d}[1], [%0], %6 \n"
    "st1 {v19.d}[1], [%0] \n"

    "mov %0, %3 \n"

    "st1 {v20.d}[0], [%0], %7 \n"
    "st1 {v22.d}[0], [%0], %7 \n"
    "st1 {v21.d}[0], [%0], %7 \n"
    "st1 {v23.d}[0], [%0], %7 \n"
    "st1 {v20.d}[1], [%0], %7 \n"
    "st1 {v22.d}[1], [%0], %7 \n"
    "st1 {v21.d}[1], [%0], %7 \n"
    "st1 {v23.d}[1], [%0] \n"

    "add %1, %1, #16 \n"  // src += 8*2
-    "add %2, %2, %6, lsl #3 \n"  // dst_a += 8 * dst_stride_a
-    "add %3, %3, %7, lsl #3 \n"  // dst_b += 8 * dst_stride_b
+    "add %2, %2, %6, lsl #3 \n"  // dst_a += 8 *
+                                 // dst_stride_a
+    "add %3, %3, %7, lsl #3 \n"  // dst_b += 8 *
+                                 // dst_stride_b
    "subs %w4, %w4, #8 \n"  // w -= 8
    "b.ge 1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds %w4, %w4, #8 \n"
    "b.eq 4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %w4, #2 \n"
    "b.lt 3f \n"

    "cmp %w4, #4 \n"
    "b.lt 2f \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    "mov %0, %1 \n"
    "ld1 {v0.8b}, [%0], %5 \n"
    "ld1 {v1.8b}, [%0], %5 \n"
    "ld1 {v2.8b}, [%0], %5 \n"
    "ld1 {v3.8b}, [%0], %5 \n"
    "ld1 {v4.8b}, [%0], %5 \n"
    "ld1 {v5.8b}, [%0], %5 \n"
    "ld1 {v6.8b}, [%0], %5 \n"
    "ld1 {v7.8b}, [%0] \n"

    "ld1 {v30.16b}, [%8], #16 \n"
    "ld1 {v31.16b}, [%8] \n"

    "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
    "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
    "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
    "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"

    "mov %0, %2 \n"

    "st1 {v16.s}[0], [%0], %6 \n"
    "st1 {v16.s}[1], [%0], %6 \n"
    "st1 {v16.s}[2], [%0], %6 \n"
    "st1 {v16.s}[3], [%0], %6 \n"

    "add %0, %2, #4 \n"
    "st1 {v18.s}[0], [%0], %6 \n"
    "st1 {v18.s}[1], [%0], %6 \n"
    "st1 {v18.s}[2], [%0], %6 \n"
    "st1 {v18.s}[3], [%0] \n"

    "mov %0, %3 \n"

    "st1 {v17.s}[0], [%0], %7 \n"
    "st1 {v17.s}[1], [%0], %7 \n"
    "st1 {v17.s}[2], [%0], %7 \n"
    "st1 {v17.s}[3], [%0], %7 \n"

    "add %0, %3, #4 \n"
    "st1 {v19.s}[0], [%0], %7 \n"
    "st1 {v19.s}[1], [%0], %7 \n"
    "st1 {v19.s}[2], [%0], %7 \n"
    "st1 {v19.s}[3], [%0] \n"

    "add %1, %1, #8 \n"  // src += 4 * 2
-    "add %2, %2, %6, lsl #2 \n"  // dst_a += 4 * dst_stride_a
-    "add %3, %3, %7, lsl #2 \n"  // dst_b += 4 * dst_stride_b
+    "add %2, %2, %6, lsl #2 \n"  // dst_a += 4 *
+                                 // dst_stride_a
+    "add %3, %3, %7, lsl #2 \n"  // dst_b += 4 *
+                                 // dst_stride_b
    "subs %w4, %w4, #4 \n"  // w -= 4
    "b.eq 4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %w4, #2 \n"
    "b.lt 3f \n"

    // 2x8 block
    "2: \n"
    "mov %0, %1 \n"
    "ld2 {v0.h, v1.h}[0], [%0], %5 \n"
    "ld2 {v2.h, v3.h}[0], [%0], %5 \n"
    "ld2 {v0.h, v1.h}[1], [%0], %5 \n"
    "ld2 {v2.h, v3.h}[1], [%0], %5 \n"
    "ld2 {v0.h, v1.h}[2], [%0], %5 \n"
    "ld2 {v2.h, v3.h}[2], [%0], %5 \n"
    "ld2 {v0.h, v1.h}[3], [%0], %5 \n"
    "ld2 {v2.h, v3.h}[3], [%0] \n"

    "trn1 v4.8b, v0.8b, v2.8b \n"
    "trn2 v5.8b, v0.8b, v2.8b \n"
    "trn1 v6.8b, v1.8b, v3.8b \n"
    "trn2 v7.8b, v1.8b, v3.8b \n"

    "mov %0, %2 \n"

    "st1 {v4.d}[0], [%0], %6 \n"
    "st1 {v6.d}[0], [%0] \n"

    "mov %0, %3 \n"

    "st1 {v5.d}[0], [%0], %7 \n"
    "st1 {v7.d}[0], [%0] \n"

    "add %1, %1, #4 \n"  // src += 2 * 2
-    "add %2, %2, %6, lsl #1 \n"  // dst_a += 2 * dst_stride_a
-    "add %3, %3, %7, lsl #1 \n"  // dst_b += 2 * dst_stride_b
+    "add %2, %2, %6, lsl #1 \n"  // dst_a += 2 *
+                                 // dst_stride_a
+    "add %3, %3, %7, lsl #1 \n"  // dst_b += 2 *
+                                 // dst_stride_b
    "subs %w4, %w4, #2 \n"  // w -= 2
    "b.eq 4f \n"

    // 1x8 block
    "3: \n"
    "ld2 {v0.b, v1.b}[0], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[1], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[2], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[3], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[4], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[5], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[6], [%1], %5 \n"
    "ld2 {v0.b, v1.b}[7], [%1] \n"

    "st1 {v0.d}[0], [%2] \n"
    "st1 {v1.d}[0], [%3] \n"

    "4: \n"

    : "=&r"(src_temp),  // %0
      "+r"(src),  // %1
      "+r"(dst_a),  // %2
      "+r"(dst_b),  // %3
      "+r"(width)  // %4
    : "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
      "r"(&kVTbl4x4TransposeDi)  // %8
-    : "memory", "cc",
-      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-      "v30", "v31"
-  );
+    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+      "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
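The two functions above change only in formatting; for orientation, this is the operation they implement, as a plain C sketch (not the code the library actually dispatches to):

  // Scalar model of TransposeWx8: a width x 8 source block becomes an
  // 8 x width destination block; each source column becomes a dest row.
  void TransposeWx8_Ref(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
    for (int i = 0; i < width; ++i) {
      for (int j = 0; j < 8; ++j) {
        dst[i * dst_stride + j] = src[j * src_stride + i];
      }
    }
  }

TransposeUVWx8 does the same but for interleaved UV pairs, de-interleaving U into dst_a and V into dst_b as it transposes.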
@@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
    movdqa xmm7, xmm5
    lea eax, [eax + 8 * edi + 16]
    neg edi
    // Second round of bit swap.
    movdqa xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
@@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
    punpckhwd xmm6, xmm7
    movdqa xmm7, xmm6

    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
@@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
 }
 #endif
 
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+  float fmax = 0.f;
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++ * scale;
+    *dst++ = v;
+    fmax = (v > fmax) ? v : fmax;
+  }
+  return fmax;
+}
+
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++ * scale;
+    *dst++ = v;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
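A quick worked example of the contract, to make the return value precise:

  float src[3] = {1.5f, -4.0f, 3.0f};
  float dst[3];
  float m = ScaleSumSamples_C(src, dst, 2.0f, 3);
  // dst is now {3.0f, -8.0f, 6.0f} and m == 6.0f.

Because fmax starts at 0.f and compares the scaled values directly, the return is the maximum scaled sample, not the maximum absolute value; an all-negative buffer returns 0.f.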
@@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fmax;
+  asm volatile(
+    "movi v3.4s, #0 \n"  // max
+    "movi v4.4s, #0 \n"  // max
+
+    "1: \n"
+    "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
+    "subs %w2, %w2, #8 \n"  // 8 processed per loop
+    "fmul v1.4s, v1.4s, %4.s[0] \n"  // scale
+    "fmul v2.4s, v2.4s, %4.s[0] \n"  // scale
+    "st1 {v1.4s, v2.4s}, [%1], #32 \n"  // store 8 samples
+    "fmax v3.4s, v3.4s, v1.4s \n"  // max
+    "fmax v4.4s, v4.4s, v2.4s \n"
+    "b.gt 1b \n"
+    "fmax v3.4s, v3.4s, v4.4s \n"  // max
+    "fmaxv %s3, v3.4s \n"  // signed max accumulator
+
+    : "+r"(src),  // %0
+      "+r"(dst),  // %1
+      "+r"(width),  // %2
+      "=w"(fmax)  // %3
+    : "w"(scale)  // %4
+    : "cc", "memory", "v1", "v2", "v3", "v4");
+  return fmax;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+  asm volatile(
+    "1: \n"
+    "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
+    "subs %w2, %w2, #8 \n"  // 8 processed per loop
+    "fmul v1.4s, v1.4s, %3.s[0] \n"  // scale
+    "fmul v2.4s, v2.4s, %3.s[0] \n"  // scale
+    "st1 {v1.4s, v2.4s}, [%1], #32 \n"  // store 8 samples
+    "b.gt 1b \n"
+    : "+r"(src),  // %0
+      "+r"(dst),  // %1
+      "+r"(width)  // %2
+    : "w"(scale)  // %3
+    : "cc", "memory", "v1", "v2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
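A C model of the vector loop's structure, to make the width contract explicit; this is a sketch of the control flow under the author's apparent assumptions, not a drop-in replacement:

  // The NEON kernel consumes 8 floats per iteration and keeps two running
  // 4-lane maxima (v3, v4) that are folded by fmax + fmaxv at the end, so
  // width is assumed to be a positive multiple of 8; there is no scalar tail.
  float ScaleSumSamples_Model(const float* src, float* dst, float scale,
                              int width) {
    float max0 = 0.f, max1 = 0.f;  // models movi v3/v4, #0
    for (int i = 0; i < width; i += 8) {
      for (int j = 0; j < 4; ++j) {          // models v1 lanes
        float v = src[i + j] * scale;
        dst[i + j] = v;
        if (v > max0) max0 = v;              // fmax v3
      }
      for (int j = 4; j < 8; ++j) {          // models v2 lanes
        float v = src[i + j] * scale;
        dst[i + j] = v;
        if (v > max1) max1 = v;              // fmax v4
      }
    }
    return max0 > max1 ? max0 : max1;        // final fmax + fmaxv reduction
  }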
@@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
@@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
@@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
@@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
@@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
    vpaddb ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
@@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
    vpermq ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
@@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
@@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
@@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
@@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
@@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
@@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
@@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
    psrlw xmm0, 6
    packuswb xmm0, xmm0  // G

    // Step 2: Weave into ARGB
    punpcklbw xmm0, xmm0  // GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
@@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
    vpsrlw ymm0, ymm0, 6
    vpackuswb ymm0, ymm0, ymm0  // G. still mutated: 3120

    // TODO(fbarchard): Weave alpha with unpack.
    // Step 2: Weave into ARGB
    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
    vpermq ymm1, ymm1, 0xd8
    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
@@ -4067,7 +4067,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
    sub edx, esi
    sub edi, esi

    // 8 pixel loop.
  convertloop8:
    movq xmm0, qword ptr [esi]  // alpha
    punpcklbw xmm0, xmm0
@@ -4123,7 +4123,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
    sub edx, esi
    sub edi, esi

    // 32 pixel loop.
  convertloop32:
    vmovdqu ymm0, [esi]  // alpha
    vpunpckhbw ymm3, ymm0, ymm0  // 8..15, 24..31
@@ -4183,7 +4183,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
    sub ecx, 4
    jl convertloop4b  // less than 4 pixels?

    // 4 pixel loop.
  convertloop4:
    movdqu xmm3, [eax]  // src argb
    lea eax, [eax + 16]
@@ -4212,7 +4212,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
    add ecx, 4 - 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax]  // src argb
    lea eax, [eax + 4]
@@ -5256,7 +5256,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
    cvtps2dq xmm5, xmm5  // 0.16 fixed point
    packssdw xmm5, xmm5  // 16 bit shorts

    // 4 pixel loop small blocks.
  s4:
    // top left
    movdqu xmm0, [eax]
@@ -5298,7 +5298,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,

    jmp l4b

    // 4 pixel loop
  l4:
    // top left
    movdqu xmm0, [eax]
@@ -5350,7 +5350,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
  l1:
    movdqu xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
@@ -5392,7 +5392,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
    test edx, 15
    jne l4b

    // 4 pixel loop
  l4:
    movdqu xmm2, [eax]  // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
@@ -5438,7 +5438,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
  l1:
    movd xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
@@ -5481,7 +5481,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
    sub ecx, 4
    jl l4b

    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44  // dup dudv
    pshufd xmm5, xmm5, 0  // dup 4, stride
    movdqa xmm0, xmm2  // x0, y0, x1, y1
@@ -5493,7 +5493,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
    addps xmm3, xmm4
    addps xmm4, xmm4  // dudv *= 4

    // 4 pixel loop
  l4:
    cvttps2dq xmm0, xmm2  // x, y float to int first 2
    cvttps2dq xmm1, xmm3  // x, y float to int next 2
@@ -5524,7 +5524,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
  l1:
    cvttps2dq xmm0, xmm2  // x, y float to int
    packssdw xmm0, xmm0  // x, y as shorts
@@ -5598,7 +5598,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
    jg xloop
    jmp xloop99

    // Blend 50 / 50.
  xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
@@ -5608,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
    jg xloop50
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb

@@ -5638,7 +5638,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
    mov ecx, [esp + 8 + 16]  // dst_width
    mov eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub edi, esi
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100  // 0 /256. Blend 100 / 0.
    cmp eax, 128
@@ -5678,7 +5678,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
    jg xloop
    jmp xloop99

    // Blend 50 / 50.
  xloop50:
    movdqu xmm0, [esi]
    movdqu xmm1, [esi + edx]
@@ -5689,7 +5689,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
    jg xloop50
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu xmm0, [esi]
    movdqu [esi + edi], xmm0
@@ -5784,7 +5784,7 @@ __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
    cmp ebx, 0x02010003
    je shuf_2103

    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx ebx, byte ptr [esi]
    movzx ebx, byte ptr [eax + ebx]
@@ -5971,7 +5971,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
    mov ecx, [esp + 4 + 16] /* width */
    pxor xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  convertloop:
    // pmovzxbd xmm0, dword ptr [eax]  // BGRA pixel
    // pmovzxbd xmm4, dword ptr [eax + 4]  // BGRA pixel
@@ -6072,7 +6072,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
    pxor xmm5, xmm5
    sub edx, eax

    // 8 pixel loop.
  convertloop:
    movdqu xmm2, xmmword ptr [eax]  // 8 shorts
    add eax, 16
@@ -6110,7 +6110,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
    vpxor ymm5, ymm5, ymm5
    sub edx, eax

    // 16 pixel loop.
  convertloop:
    vmovdqu ymm2, [eax]  // 16 shorts
    add eax, 32
@@ -6144,7 +6144,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
    mov ecx, [esp + 16] /* width */
    sub edx, eax

    // 16 pixel loop.
  convertloop:
    vpmovzxwd ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
    vpmovzxwd ymm3, xmmword ptr [eax + 16]  // 8 more shorts
@@ -6252,7 +6252,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
    psllw xmm4, 8
    pxor xmm5, xmm5

    // 4 pixel loop.
  convertloop:
    movdqu xmm0, xmmword ptr [eax]  // generate luma ptr
    pmaddubsw xmm0, xmm3
(Diff for one file suppressed by the viewer because it is too large.)
@@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
@@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
@@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
@@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
@@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
@@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
    test ecx, 1
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
  xloop99:
@@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
@@ -11,6 +11,9 @@
 #include <stdlib.h>
 #include <time.h>
 
+// row.h defines SIMD_ALIGNED, overriding unit_test.h
+#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
+
 #include "../unit_test/unit_test.h"
 #include "libyuv/compare.h"
 #include "libyuv/convert.h"
@@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+float TestScaleSumSamples(int benchmark_width,
+                          int benchmark_height,
+                          int benchmark_iterations,
+                          float scale,
+                          bool opt) {
+  int i, j;
+  float max_c, max_opt;
+  const int y_plane_size = benchmark_width * benchmark_height * 4;
+
+  align_buffer_page_end(orig_y, y_plane_size * 3);
+  uint8* dst_opt = orig_y + y_plane_size;
+  uint8* dst_c = orig_y + y_plane_size * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, y_plane_size);
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
+  }
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 1, y_plane_size);
+
+  // Disable all optimizations.
+  max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+                            reinterpret_cast<float*>(dst_c), scale,
+                            benchmark_width * benchmark_height);
+
+  // Enable optimizations.
+  for (j = 0; j < benchmark_iterations; j++) {
+#ifdef HAS_SCALESUMSAMPLES_NEON
+    if (opt) {
+      max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
+                                     reinterpret_cast<float*>(dst_opt), scale,
+                                     benchmark_width * benchmark_height);
+    } else {
+      max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+                                  reinterpret_cast<float*>(dst_opt), scale,
+                                  benchmark_width * benchmark_height);
+    }
+#else
+    max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
+                                reinterpret_cast<float*>(dst_opt), scale,
+                                benchmark_width * benchmark_height);
+#endif
+  }
+
+  float max_diff = 0;
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+                          (reinterpret_cast<float*>(dst_opt)[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
+  float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
+                                   benchmark_iterations_, 1.2f, false);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
+  float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
+                                   benchmark_iterations_, 1.2f, true);
+  EXPECT_EQ(0, diff);
+}
+
+float TestScaleSamples(int benchmark_width,
+                       int benchmark_height,
+                       int benchmark_iterations,
+                       float scale,
+                       bool opt) {
+  int i, j;
+  const int y_plane_size = benchmark_width * benchmark_height * 4;
+
+  align_buffer_page_end(orig_y, y_plane_size * 3);
+  uint8* dst_opt = orig_y + y_plane_size;
+  uint8* dst_c = orig_y + y_plane_size * 2;
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, y_plane_size);
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    (reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
+  }
+
+  memset(dst_c, 0, y_plane_size);
+  memset(dst_opt, 1, y_plane_size);
+
+  // Disable all optimizations.
+  ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+                 reinterpret_cast<float*>(dst_c), scale,
+                 benchmark_width * benchmark_height);
+
+  // Enable optimizations.
+  for (j = 0; j < benchmark_iterations; j++) {
+#ifdef HAS_SCALESAMPLES_NEON
+    if (opt) {
+      ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
+                        reinterpret_cast<float*>(dst_opt), scale,
+                        benchmark_width * benchmark_height);
+    } else {
+      ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+                     reinterpret_cast<float*>(dst_opt), scale,
+                     benchmark_width * benchmark_height);
+    }
+#else
+    ScaleSamples_C(reinterpret_cast<float*>(orig_y),
+                   reinterpret_cast<float*>(dst_opt), scale,
+                   benchmark_width * benchmark_height);
+#endif
+  }
+
+  float max_diff = 0;
+  for (i = 0; i < y_plane_size / 4; ++i) {
+    float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
+                          (reinterpret_cast<float*>(dst_opt)[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
+  float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, 1.2f, false);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
+  float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, 1.2f, true);
+  EXPECT_EQ(0, diff);
+}
+
 }  // namespace libyuv
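To run just these tests, the usual gtest filter applies; the binary path is assumed to be the standard libyuv_unittest target:

  out/Release/libyuv_unittest \
    --gtest_filter='LibYUVPlanarTest.TestScaleSumSamples_*:LibYUVPlanarTest.TestScaleSamples_*'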
@@ -36,6 +36,9 @@ static __inline int Abs(int v) {
   return v >= 0 ? v : -v;
 }
 
+static __inline float FAbs(float v) {
+  return v >= 0 ? v : -v;
+}
 #define OFFBY 0
 
 // Scaling uses 16.16 fixed point to step thru the source image, so a
@@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width,
   uint8* var;                                                                \
   uint8* var##_mem;                                                          \
   var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
-  var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \
-                 ~4095) - (size)) & ~63);
+  var = (uint8*)((intptr_t)(var##_mem +                                      \
+                            (((size) + 4095 + 63) & /* NOLINT */             \
+                             ~4095) -                                        \
+                            (size)) &                                        \
+                 ~63);
 
 #define free_aligned_buffer_page_end(var) \
   free(var##_mem);                        \
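The reflowed macro reads more easily as arithmetic: round the requested size (plus 63 bytes of alignment slack) up to whole 4 KB pages, place the buffer so it ends at the allocation's page boundary, then round the start down to 64 bytes. A sketch of the same computation as a function, hypothetical name:

  // Returns a pointer whose (size) bytes end near a 4 KB page boundary,
  // so a vector kernel that reads past the end tends to fault immediately.
  uint8* AlignBufferPageEnd(uint8* mem, intptr_t size) {
    intptr_t rounded = (size + 4095 + 63) & ~4095;  // whole pages incl. slack
    return (uint8*)(((intptr_t)(mem + rounded - size)) & ~63);  // 64B aligned
  }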