scale float samples and return max value

BUG=libyuv:717
TEST=ScaleSumSamples unit test to compare C vs ARM implementations
TBR=kjellander@chromium.org

Change-Id: Iaa7af5547d979aad4722f868d31b405340115748
Reviewed-on: https://chromium-review.googlesource.com/600534
Reviewed-by: Cheng Wang <wangcheng@google.com>
Frank Barchard 2017-08-04 16:05:25 -07:00
parent 27036e33e8
commit 8676ad7004
14 changed files with 1245 additions and 1031 deletions
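
The new API scales a buffer of float samples into a destination buffer and returns the maximum scaled value. A minimal usage sketch (not part of the commit) of the C entry point this change declares in row.h; the buffer size and sample values are made up for illustration:

#include <stdio.h>
#include "libyuv/row.h"  // declares ScaleSumSamples_C (added in this change)

int main() {
  const int kWidth = 8;  // the NEON kernel below works in groups of 8 floats
  float src[8] = {0.f, 1.f, -2.f, 3.f, 4.f, -5.f, 6.f, 7.f};
  float dst[8];
  // Writes src[i] * 1.2f into dst[i] and returns the largest scaled value.
  float mx = libyuv::ScaleSumSamples_C(src, dst, 1.2f, kWidth);
  printf("max scaled sample = %f\n", mx);  // 8.4 (7 * 1.2) for this input
  return 0;
}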


@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1663
Version: 1664
License: BSD
License File: LICENSE


@ -359,6 +359,11 @@ extern "C" {
#define HAS_SOBELYROW_NEON
#endif
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_SCALESUMSAMPLES_NEON
#endif
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
(_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
const uint8* luma,
uint32 lumacoeff);
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width);
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
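
The declarations above pair a C reference with a NEON kernel gated by HAS_SCALESUMSAMPLES_NEON. A hedged sketch of how a libyuv-style caller might dispatch between them at runtime; the wrapper name is made up, while TestCpuFlag and kCpuHasNEON are existing helpers from libyuv/cpu_id.h (the sketch assumes it lives inside libyuv sources where these names are in scope):

// Illustrative wrapper, not part of this change.
float ScaleSumSamples(const float* src, float* dst, float scale, int width) {
  float (*fn)(const float*, float*, float, int) = ScaleSumSamples_C;
#if defined(HAS_SCALESUMSAMPLES_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {  // runtime check on top of the compile-time gate
    fn = ScaleSumSamples_NEON;
  }
#endif
  return fn(src, dst, scale, width);
}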


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1663
#define LIBYUV_VERSION 1664
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -26,67 +26,61 @@ extern "C" {
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"vmov.u16 q4, #0 \n" // accumulator
asm volatile(
"vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
"vld1.8 {q2, q3}, [%1]! \n"
"veor.32 q0, q0, q2 \n"
"veor.32 q1, q1, q3 \n"
"vcnt.i8 q0, q0 \n"
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
"vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
"vpaddl.u16 q0, q4 \n" // 4 ints
"vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "q0", "q1", "q2", "q3", "q4");
return diff;
}
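
For readers not fluent in NEON: the loop above XORs the two inputs, popcounts each byte of the result, and accumulates. A scalar sketch of the same computation (the helper name is illustrative; __builtin_popcount assumes GCC/Clang, and the NEON kernel itself consumes 32 bytes per iteration):

#include <stdint.h>

// uint8_t/uint32_t stand in for libyuv's uint8/uint32 typedefs.
static uint32_t HammingDistance_Ref(const uint8_t* src_a, const uint8_t* src_b,
                                    int count) {
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    diff += (uint32_t)__builtin_popcount(src_a[i] ^ src_b[i]);  // differing bits
  }
  return diff;
}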
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
asm volatile(
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q11, #0 \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"1: \n"
"vld1.8 {q0}, [%0]! \n"
"vld1.8 {q1}, [%1]! \n"
"subs %2, %2, #16 \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q8, d4, d4 \n"
"vmlal.s16 q9, d6, d6 \n"
"vmlal.s16 q10, d5, d5 \n"
"vmlal.s16 q11, d7, d7 \n"
"bgt 1b \n"
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
"vadd.u32 q8, q8, q9 \n"
"vadd.u32 q10, q10, q11 \n"
"vadd.u32 q11, q8, q10 \n"
"vpaddl.u32 q1, q11 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
return sse;
}
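
Likewise, SumSquareError_NEON widens bytes to shorts, subtracts, and accumulates squared differences in 32-bit lanes. A scalar sketch of the result it produces (illustrative helper; the NEON kernel consumes 16 bytes per iteration):

#include <stdint.h>

static uint32_t SumSquareError_Ref(const uint8_t* src_a, const uint8_t* src_b,
                                   int count) {
  uint32_t sse = 0u;
  for (int i = 0; i < count; ++i) {
    int d = (int)src_a[i] - (int)src_b[i];
    sse += (uint32_t)(d * d);
  }
  return sse;
}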


@ -24,63 +24,57 @@ extern "C" {
// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
"movi v4.8h, #0 \n"
asm volatile(
"movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
"uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
"uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
:
: "cc", "v0", "v1", "v2", "v3", "v4");
return diff;
}
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;
asm volatile (
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
asm volatile(
"eor v16.16b, v16.16b, v16.16b \n"
"eor v18.16b, v18.16b, v18.16b \n"
"eor v17.16b, v17.16b, v17.16b \n"
"eor v19.16b, v19.16b, v19.16b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"1: \n"
"ld1 {v0.16b}, [%0], #16 \n"
"ld1 {v1.16b}, [%1], #16 \n"
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
"=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
"add v16.4s, v16.4s, v17.4s \n"
"add v18.4s, v18.4s, v19.4s \n"
"add v19.4s, v16.4s, v18.4s \n"
"addv s0, v19.4s \n"
"fmov %w3, s0 \n"
: "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
:
: "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
return sse;
}


@ -30,14 +30,14 @@ void TransposeWx8_NEON(const uint8* src,
int dst_stride,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w3, %w3, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@ -92,109 +92,108 @@ void TransposeWx8_NEON(const uint8* src,
"subs %w3, %w3, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w3, %w3, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w3, #2 \n"
"b.lt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w3, #2 \n"
"b.lt 3f \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
"cmp %w3, #4 \n"
"b.lt 2f \n"
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
"ld1 {v1.s}[3], [%0] \n"
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.s}[0], [%0], %5 \n"
"ld1 {v0.s}[1], [%0], %5 \n"
"ld1 {v0.s}[2], [%0], %5 \n"
"ld1 {v0.s}[3], [%0], %5 \n"
"ld1 {v1.s}[0], [%0], %5 \n"
"ld1 {v1.s}[1], [%0], %5 \n"
"ld1 {v1.s}[2], [%0], %5 \n"
"ld1 {v1.s}[3], [%0] \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"ld1 {v2.16b}, [%4] \n"
"ld1 {v2.16b}, [%4] \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
"tbl v3.16b, {v0.16b}, v2.16b \n"
"tbl v0.16b, {v1.16b}, v2.16b \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"st1 {v3.s}[0], [%0], %6 \n"
"st1 {v3.s}[1], [%0], %6 \n"
"st1 {v3.s}[2], [%0], %6 \n"
"st1 {v3.s}[3], [%0] \n"
// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"st1 {v3.s}[0], [%0], %6 \n"
"st1 {v3.s}[1], [%0], %6 \n"
"st1 {v3.s}[2], [%0], %6 \n"
"st1 {v3.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v0.s}[0], [%0], %6 \n"
"st1 {v0.s}[1], [%0], %6 \n"
"st1 {v0.s}[2], [%0], %6 \n"
"st1 {v0.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v0.s}[0], [%0], %6 \n"
"st1 {v0.s}[1], [%0], %6 \n"
"st1 {v0.s}[2], [%0], %6 \n"
"st1 {v0.s}[3], [%0] \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n"
"add %1, %1, #4 \n" // src += 4
"add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride
"subs %w3, %w3, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w3, #2 \n"
"b.lt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w3, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld1 {v0.h}[0], [%0], %5 \n"
"ld1 {v1.h}[0], [%0], %5 \n"
"ld1 {v0.h}[1], [%0], %5 \n"
"ld1 {v1.h}[1], [%0], %5 \n"
"ld1 {v0.h}[2], [%0], %5 \n"
"ld1 {v1.h}[2], [%0], %5 \n"
"ld1 {v0.h}[3], [%0], %5 \n"
"ld1 {v1.h}[3], [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld1 {v0.h}[0], [%0], %5 \n"
"ld1 {v1.h}[0], [%0], %5 \n"
"ld1 {v0.h}[1], [%0], %5 \n"
"ld1 {v1.h}[1], [%0], %5 \n"
"ld1 {v0.h}[2], [%0], %5 \n"
"ld1 {v1.h}[2], [%0], %5 \n"
"ld1 {v0.h}[3], [%0], %5 \n"
"ld1 {v1.h}[3], [%0] \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"trn2 v2.8b, v0.8b, v1.8b \n"
"trn1 v3.8b, v0.8b, v1.8b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v3.8b}, [%0], %6 \n"
"st1 {v2.8b}, [%0] \n"
"st1 {v3.8b}, [%0], %6 \n"
"st1 {v2.8b}, [%0] \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n"
"add %1, %1, #2 \n" // src += 2
"add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride
"subs %w3, %w3, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
"ld1 {v0.b}[0], [%1], %5 \n"
"ld1 {v0.b}[1], [%1], %5 \n"
"ld1 {v0.b}[2], [%1], %5 \n"
"ld1 {v0.b}[3], [%1], %5 \n"
"ld1 {v0.b}[4], [%1], %5 \n"
"ld1 {v0.b}[5], [%1], %5 \n"
"ld1 {v0.b}[6], [%1], %5 \n"
"ld1 {v0.b}[7], [%1] \n"
// 1x8 block
"3: \n"
"ld1 {v0.b}[0], [%1], %5 \n"
"ld1 {v0.b}[1], [%1], %5 \n"
"ld1 {v0.b}[2], [%1], %5 \n"
"ld1 {v0.b}[3], [%1], %5 \n"
"ld1 {v0.b}[4], [%1], %5 \n"
"ld1 {v0.b}[5], [%1], %5 \n"
"ld1 {v0.b}[6], [%1], %5 \n"
"ld1 {v0.b}[7], [%1] \n"
"st1 {v0.8b}, [%2] \n"
"st1 {v0.8b}, [%2] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst), // %2
"+r"(width) // %3
: "r"(&kVTbl4x4Transpose), // %4
"r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride)) // %6
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
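
Stripped of the register choreography, the function above transposes the plane in 8-row strips: byte (j, i) of the source strip becomes byte (i, j) of the destination. A scalar sketch of that mapping (illustrative helper; the residual 4/2/1-column paths above exist only because the NEON path works in 8x8 tiles):

#include <stdint.h>

static void TransposeWx8_Ref(const uint8_t* src, int src_stride,
                             uint8_t* dst, int dst_stride, int width) {
  for (int i = 0; i < width; ++i) {      // each source column becomes a dst row
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}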
static uint8 kVTbl4x4TransposeDi[32] = {
@ -209,212 +208,215 @@ void TransposeUVWx8_NEON(const uint8* src,
int dst_stride_b,
int width) {
const uint8* src_temp;
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w4, %w4, #8 \n"
asm volatile(
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %w4, %w4, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.16b}, [%0], %5 \n"
"ld1 {v1.16b}, [%0], %5 \n"
"ld1 {v2.16b}, [%0], %5 \n"
"ld1 {v3.16b}, [%0], %5 \n"
"ld1 {v4.16b}, [%0], %5 \n"
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"ld1 {v0.16b}, [%0], %5 \n"
"ld1 {v1.16b}, [%0], %5 \n"
"ld1 {v2.16b}, [%0], %5 \n"
"ld1 {v3.16b}, [%0], %5 \n"
"ld1 {v4.16b}, [%0], %5 \n"
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"
"trn1 v18.16b, v2.16b, v3.16b \n"
"trn2 v19.16b, v2.16b, v3.16b \n"
"trn1 v20.16b, v4.16b, v5.16b \n"
"trn2 v21.16b, v4.16b, v5.16b \n"
"trn1 v22.16b, v6.16b, v7.16b \n"
"trn2 v23.16b, v6.16b, v7.16b \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v0.8h, v16.8h, v18.8h \n"
"trn2 v1.8h, v16.8h, v18.8h \n"
"trn1 v2.8h, v20.8h, v22.8h \n"
"trn2 v3.8h, v20.8h, v22.8h \n"
"trn1 v4.8h, v17.8h, v19.8h \n"
"trn2 v5.8h, v17.8h, v19.8h \n"
"trn1 v6.8h, v21.8h, v23.8h \n"
"trn2 v7.8h, v21.8h, v23.8h \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"trn1 v16.4s, v0.4s, v2.4s \n"
"trn2 v17.4s, v0.4s, v2.4s \n"
"trn1 v18.4s, v1.4s, v3.4s \n"
"trn2 v19.4s, v1.4s, v3.4s \n"
"trn1 v20.4s, v4.4s, v6.4s \n"
"trn2 v21.4s, v4.4s, v6.4s \n"
"trn1 v22.4s, v5.4s, v7.4s \n"
"trn2 v23.4s, v5.4s, v7.4s \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v16.d}[0], [%0], %6 \n"
"st1 {v18.d}[0], [%0], %6 \n"
"st1 {v17.d}[0], [%0], %6 \n"
"st1 {v19.d}[0], [%0], %6 \n"
"st1 {v16.d}[1], [%0], %6 \n"
"st1 {v18.d}[1], [%0], %6 \n"
"st1 {v17.d}[1], [%0], %6 \n"
"st1 {v19.d}[1], [%0] \n"
"st1 {v16.d}[0], [%0], %6 \n"
"st1 {v18.d}[0], [%0], %6 \n"
"st1 {v17.d}[0], [%0], %6 \n"
"st1 {v19.d}[0], [%0], %6 \n"
"st1 {v16.d}[1], [%0], %6 \n"
"st1 {v18.d}[1], [%0], %6 \n"
"st1 {v17.d}[1], [%0], %6 \n"
"st1 {v19.d}[1], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v20.d}[0], [%0], %7 \n"
"st1 {v22.d}[0], [%0], %7 \n"
"st1 {v21.d}[0], [%0], %7 \n"
"st1 {v23.d}[0], [%0], %7 \n"
"st1 {v20.d}[1], [%0], %7 \n"
"st1 {v22.d}[1], [%0], %7 \n"
"st1 {v21.d}[1], [%0], %7 \n"
"st1 {v23.d}[1], [%0] \n"
"st1 {v20.d}[0], [%0], %7 \n"
"st1 {v22.d}[0], [%0], %7 \n"
"st1 {v21.d}[0], [%0], %7 \n"
"st1 {v23.d}[0], [%0], %7 \n"
"st1 {v20.d}[1], [%0], %7 \n"
"st1 {v22.d}[1], [%0], %7 \n"
"st1 {v21.d}[1], [%0], %7 \n"
"st1 {v23.d}[1], [%0] \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
"add %1, %1, #16 \n" // src += 8*2
"add %2, %2, %6, lsl #3 \n" // dst_a += 8 *
// dst_stride_a
"add %3, %3, %7, lsl #3 \n" // dst_b += 8 *
// dst_stride_b
"subs %w4, %w4, #8 \n" // w -= 8
"b.ge 1b \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %w4, %w4, #8 \n"
"b.eq 4f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n"
"b.lt 3f \n"
// some residual, so between 1 and 7 lines left to transpose
"cmp %w4, #2 \n"
"b.lt 3f \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
"cmp %w4, #4 \n"
"b.lt 2f \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
"ld1 {v1.8b}, [%0], %5 \n"
"ld1 {v2.8b}, [%0], %5 \n"
"ld1 {v3.8b}, [%0], %5 \n"
"ld1 {v4.8b}, [%0], %5 \n"
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
// TODO(frkoenig): Clean this up
// 4x8 block
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
"ld1 {v1.8b}, [%0], %5 \n"
"ld1 {v2.8b}, [%0], %5 \n"
"ld1 {v3.8b}, [%0], %5 \n"
"ld1 {v4.8b}, [%0], %5 \n"
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"ld1 {v30.16b}, [%8], #16 \n"
"ld1 {v31.16b}, [%8] \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
"tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
"tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
"tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v16.s}[0], [%0], %6 \n"
"st1 {v16.s}[1], [%0], %6 \n"
"st1 {v16.s}[2], [%0], %6 \n"
"st1 {v16.s}[3], [%0], %6 \n"
"st1 {v16.s}[0], [%0], %6 \n"
"st1 {v16.s}[1], [%0], %6 \n"
"st1 {v16.s}[2], [%0], %6 \n"
"st1 {v16.s}[3], [%0], %6 \n"
"add %0, %2, #4 \n"
"st1 {v18.s}[0], [%0], %6 \n"
"st1 {v18.s}[1], [%0], %6 \n"
"st1 {v18.s}[2], [%0], %6 \n"
"st1 {v18.s}[3], [%0] \n"
"add %0, %2, #4 \n"
"st1 {v18.s}[0], [%0], %6 \n"
"st1 {v18.s}[1], [%0], %6 \n"
"st1 {v18.s}[2], [%0], %6 \n"
"st1 {v18.s}[3], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v17.s}[0], [%0], %7 \n"
"st1 {v17.s}[1], [%0], %7 \n"
"st1 {v17.s}[2], [%0], %7 \n"
"st1 {v17.s}[3], [%0], %7 \n"
"st1 {v17.s}[0], [%0], %7 \n"
"st1 {v17.s}[1], [%0], %7 \n"
"st1 {v17.s}[2], [%0], %7 \n"
"st1 {v17.s}[3], [%0], %7 \n"
"add %0, %3, #4 \n"
"st1 {v19.s}[0], [%0], %7 \n"
"st1 {v19.s}[1], [%0], %7 \n"
"st1 {v19.s}[2], [%0], %7 \n"
"st1 {v19.s}[3], [%0] \n"
"add %0, %3, #4 \n"
"st1 {v19.s}[0], [%0], %7 \n"
"st1 {v19.s}[1], [%0], %7 \n"
"st1 {v19.s}[2], [%0], %7 \n"
"st1 {v19.s}[3], [%0] \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
"add %1, %1, #8 \n" // src += 4 * 2
"add %2, %2, %6, lsl #2 \n" // dst_a += 4 *
// dst_stride_a
"add %3, %3, %7, lsl #2 \n" // dst_b += 4 *
// dst_stride_b
"subs %w4, %w4, #4 \n" // w -= 4
"b.eq 4f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w4, #2 \n"
"b.lt 3f \n"
// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %w4, #2 \n"
"b.lt 3f \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
"ld2 {v2.h, v3.h}[3], [%0] \n"
// 2x8 block
"2: \n"
"mov %0, %1 \n"
"ld2 {v0.h, v1.h}[0], [%0], %5 \n"
"ld2 {v2.h, v3.h}[0], [%0], %5 \n"
"ld2 {v0.h, v1.h}[1], [%0], %5 \n"
"ld2 {v2.h, v3.h}[1], [%0], %5 \n"
"ld2 {v0.h, v1.h}[2], [%0], %5 \n"
"ld2 {v2.h, v3.h}[2], [%0], %5 \n"
"ld2 {v0.h, v1.h}[3], [%0], %5 \n"
"ld2 {v2.h, v3.h}[3], [%0] \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"trn1 v4.8b, v0.8b, v2.8b \n"
"trn2 v5.8b, v0.8b, v2.8b \n"
"trn1 v6.8b, v1.8b, v3.8b \n"
"trn2 v7.8b, v1.8b, v3.8b \n"
"mov %0, %2 \n"
"mov %0, %2 \n"
"st1 {v4.d}[0], [%0], %6 \n"
"st1 {v6.d}[0], [%0] \n"
"st1 {v4.d}[0], [%0], %6 \n"
"st1 {v6.d}[0], [%0] \n"
"mov %0, %3 \n"
"mov %0, %3 \n"
"st1 {v5.d}[0], [%0], %7 \n"
"st1 {v7.d}[0], [%0] \n"
"st1 {v5.d}[0], [%0], %7 \n"
"st1 {v7.d}[0], [%0] \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
"add %1, %1, #4 \n" // src += 2 * 2
"add %2, %2, %6, lsl #1 \n" // dst_a += 2 *
// dst_stride_a
"add %3, %3, %7, lsl #1 \n" // dst_b += 2 *
// dst_stride_b
"subs %w4, %w4, #2 \n" // w -= 2
"b.eq 4f \n"
// 1x8 block
"3: \n"
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
"ld2 {v0.b, v1.b}[7], [%1] \n"
// 1x8 block
"3: \n"
"ld2 {v0.b, v1.b}[0], [%1], %5 \n"
"ld2 {v0.b, v1.b}[1], [%1], %5 \n"
"ld2 {v0.b, v1.b}[2], [%1], %5 \n"
"ld2 {v0.b, v1.b}[3], [%1], %5 \n"
"ld2 {v0.b, v1.b}[4], [%1], %5 \n"
"ld2 {v0.b, v1.b}[5], [%1], %5 \n"
"ld2 {v0.b, v1.b}[6], [%1], %5 \n"
"ld2 {v0.b, v1.b}[7], [%1] \n"
"st1 {v0.d}[0], [%2] \n"
"st1 {v1.d}[0], [%3] \n"
"st1 {v0.d}[0], [%2] \n"
"st1 {v1.d}[0], [%3] \n"
"4: \n"
"4: \n"
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc",
"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
"v30", "v31"
);
: "=&r"(src_temp), // %0
"+r"(src), // %1
"+r"(dst_a), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "r"(static_cast<ptrdiff_t>(src_stride)), // %5
"r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6
"r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7
"r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
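
The UV variant does the same 8-row transpose on interleaved UV bytes while de-interleaving: even bytes go to dst_a and odd bytes to dst_b. A scalar sketch of that mapping (illustrative helper):

#include <stdint.h>

static void TransposeUVWx8_Ref(const uint8_t* src, int src_stride,
                               uint8_t* dst_a, int dst_stride_a,
                               uint8_t* dst_b, int dst_stride_b, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}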
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)


@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
movdqa xmm7, xmm5
lea eax, [eax + 8 * edi + 16]
neg edi
// Second round of bit swap.
// Second round of bit swap.
movdqa xmm5, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm5, xmm2
@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
punpckhwd xmm6, xmm7
movdqa xmm7, xmm6
// Third round of bit swap.
// Write to the destination pointer.
// Third round of bit swap.
// Write to the destination pointer.
movdqa xmm6, xmm0
punpckldq xmm0, xmm4
punpckhdq xmm6, xmm4


@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
}
#endif
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
float fmax = 0.f;
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
fmax = (v > fmax) ? v : fmax;
}
return fmax;
}
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
int i;
for (i = 0; i < width; ++i) {
float v = *src++ * scale;
*dst++ = v;
}
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
: "cc", "memory", "v1", "v2", "v3");
}
float ScaleSumSamples_NEON(const float* src,
float* dst,
float scale,
int width) {
float fmax;
asm volatile(
"movi v3.4s, #0 \n" // max
"movi v4.4s, #0 \n" // max
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"fmax v3.4s, v3.4s, v1.4s \n" // max
"fmax v4.4s, v4.4s, v2.4s \n"
"b.gt 1b \n"
"fmax v3.4s, v3.4s, v4.4s \n" // max
"fmaxv %s3, v3.4s \n" // signed max acculator
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width), // %2
"=w"(fmax) // %3
: "w"(scale) // %4
: "cc", "memory", "v1", "v2", "v3", "v4");
return fmax;
}
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
asm volatile(
"1: \n"
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
"subs %w2, %w2, #8 \n" // 8 processed per loop
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "w"(scale) // %3
: "cc", "memory", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
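
For comparison, the same ScaleSumSamples operation written with AArch64 NEON intrinsics rather than inline assembly (a sketch, not part of this change; like the asm, it assumes width is a multiple of 8 and starts the max accumulators at 0):

#include <arm_neon.h>

float ScaleSumSamples_NEON_Intrinsics(const float* src, float* dst,
                                      float scale, int width) {
  float32x4_t max0 = vdupq_n_f32(0.f);  // running max, two sets of 4 lanes
  float32x4_t max1 = vdupq_n_f32(0.f);
  for (int i = 0; i < width; i += 8) {
    float32x4_t v0 = vmulq_n_f32(vld1q_f32(src + i), scale);
    float32x4_t v1 = vmulq_n_f32(vld1q_f32(src + i + 4), scale);
    vst1q_f32(dst + i, v0);
    vst1q_f32(dst + i + 4, v1);
    max0 = vmaxq_f32(max0, v0);
    max1 = vmaxq_f32(max1, v1);
  }
  return vmaxvq_f32(vmaxq_f32(max0, max1));  // horizontal max, AArch64 only
}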
#ifdef __cplusplus


@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
psraw xmm1, 8
packsswb xmm0, xmm1
// step 3 - store 8 U and 8 V values
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0,
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
vpaddb ymm0, ymm0, ymm5 // -> unsigned
// step 3 - store 16 U and 16 V values
// step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vshufps ymm2, ymm2, ymm3, 0xdd
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 32 different pixels, its 16 pixels of U and 16 of V
vpmaddubsw ymm1, ymm0, ymm7 // U
vpmaddubsw ymm3, ymm2, ymm7
vpmaddubsw ymm0, ymm0, ymm6 // V
@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0,
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
// step 3 - store 16 U and 16 V values
// step 3 - store 16 U and 16 V values
vextractf128 [edx], ymm0, 0 // U
vextractf128 [edx + edi], ymm0, 1 // V
lea edx, [edx + 16]
@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
shufps xmm4, xmm3, 0xdd
pavgb xmm2, xmm4
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 16 different pixels, its 8 pixels of U and 8 of V
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pmaddubsw xmm0, xmm7 // U
@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0,
packsswb xmm0, xmm1
paddb xmm0, xmm5 // -> unsigned
// step 3 - store 8 U and 8 V values
// step 3 - store 8 U and 8 V values
movlps qword ptr [edx], xmm0 // U
movhps qword ptr [edx + edi], xmm0 // V
lea edx, [edx + 8]
@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
psrlw xmm0, 6
packuswb xmm0, xmm0 // G
// Step 2: Weave into ARGB
// Step 2: Weave into ARGB
punpcklbw xmm0, xmm0 // GG
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
vpsrlw ymm0, ymm0, 6
vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
// TODO(fbarchard): Weave alpha with unpack.
// Step 2: Weave into ARGB
vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates
vpermq ymm1, ymm1, 0xd8
vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels
@ -4067,7 +4067,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0,
sub edx, esi
sub edi, esi
// 8 pixel loop.
// 8 pixel loop.
convertloop8:
movq xmm0, qword ptr [esi] // alpha
punpcklbw xmm0, xmm0
@ -4123,7 +4123,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0,
sub edx, esi
sub edi, esi
// 32 pixel loop.
// 32 pixel loop.
convertloop32:
vmovdqu ymm0, [esi] // alpha
vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31
@ -4183,7 +4183,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
sub ecx, 4
jl convertloop4b // less than 4 pixels?
// 4 pixel loop.
// 4 pixel loop.
convertloop4:
movdqu xmm3, [eax] // src argb
lea eax, [eax + 16]
@ -4212,7 +4212,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0,
add ecx, 4 - 1
jl convertloop1b
// 1 pixel loop.
// 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@ -5256,7 +5256,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
cvtps2dq xmm5, xmm5 // 0.16 fixed point
packssdw xmm5, xmm5 // 16 bit shorts
// 4 pixel loop small blocks.
// 4 pixel loop small blocks.
s4:
// top left
movdqu xmm0, [eax]
@ -5298,7 +5298,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
jmp l4b
// 4 pixel loop
// 4 pixel loop
l4:
// top left
movdqu xmm0, [eax]
@ -5350,7 +5350,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@ -5392,7 +5392,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
test edx, 15
jne l4b
// 4 pixel loop
// 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@ -5438,7 +5438,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
@ -5481,7 +5481,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
sub ecx, 4
jl l4b
// setup for 4 pixel loop
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@ -5493,7 +5493,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
// 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
@ -5524,7 +5524,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
// 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts
@ -5598,7 +5598,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop
jmp xloop99
// Blend 50 / 50.
// Blend 50 / 50.
xloop50:
vmovdqu ymm0, [esi]
vpavgb ymm0, ymm0, [esi + edx]
@ -5608,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr,
jg xloop50
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
// Blend 100 / 0 - Copy row unchanged.
xloop100:
rep movsb
@ -5638,7 +5638,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
@ -5678,7 +5678,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop
jmp xloop99
// Blend 50 / 50.
// Blend 50 / 50.
xloop50:
movdqu xmm0, [esi]
movdqu xmm1, [esi + edx]
@ -5689,7 +5689,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr,
jg xloop50
jmp xloop99
// Blend 100 / 0 - Copy row unchanged.
// Blend 100 / 0 - Copy row unchanged.
xloop100:
movdqu xmm0, [esi]
movdqu [esi + edi], xmm0
@ -5784,7 +5784,7 @@ __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb,
cmp ebx, 0x02010003
je shuf_2103
// TODO(fbarchard): Use one source pointer and 3 offsets.
// TODO(fbarchard): Use one source pointer and 3 offsets.
shuf_any1:
movzx ebx, byte ptr [esi]
movzx ebx, byte ptr [eax + ebx]
@ -5971,7 +5971,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
// 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
@ -6072,7 +6072,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src,
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@ -6110,7 +6110,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@ -6144,7 +6144,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
@ -6252,7 +6252,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
// 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3

File diff suppressed because it is too large.


@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
pxor xmm5, xmm5
// sum rows
// sum rows
xloop:
movdqu xmm3, [eax] // read 16 bytes
lea eax, [eax + 16]
@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
mov ecx, [esp + 12] // src_width
vpxor ymm5, ymm5, ymm5
// sum rows
// sum rows
xloop:
vmovdqu ymm3, [eax] // read 32 bytes
lea eax, [eax + 32]
@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
movzx ebx, word ptr [esi + eax] // 2 source x0 pixels
movd xmm0, ebx
psrlw xmm2, 9 // 7 bit fractions.
@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
sub ecx, 4
jl xloop49
// 4 Pixel loop.
// 4 Pixel loop.
xloop4:
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 2
je xloop29
// 2 Pixels.
// 2 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x0 pixels
movd xmm1, [esi + edx * 4] // 1 source x1 pixels
pextrw eax, xmm2, 5 // get x2 integer.
@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
test ecx, 1
je xloop99
// 1 Pixels.
// 1 Pixels.
movd xmm0, [esi + eax * 4] // 1 source x2 pixels
movd dword ptr [edi], xmm0
xloop99:
@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
add ecx, 2 - 1
jl xloop99
// 1 pixel remainder
// 1 pixel remainder
psrlw xmm2, 9 // 7 bit fractions.
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels
pshufb xmm2, xmm5 // 00000000


@ -11,6 +11,9 @@
#include <stdlib.h>
#include <time.h>
// row.h defines SIMD_ALIGNED, overriding unit_test.h
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */
#include "../unit_test/unit_test.h"
#include "libyuv/compare.h"
#include "libyuv/convert.h"
@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
float TestScaleSumSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
float max_c, max_opt;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESUMSAMPLES_NEON
if (opt) {
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) {
float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
float TestScaleSamples(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
const int y_plane_size = benchmark_width * benchmark_height * 4;
align_buffer_page_end(orig_y, y_plane_size * 3);
uint8* dst_opt = orig_y + y_plane_size;
uint8* dst_c = orig_y + y_plane_size * 2;
// Randomize works but may contain some denormals affecting performance.
// MemRandomize(orig_y, y_plane_size);
for (i = 0; i < y_plane_size / 4; ++i) {
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
}
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 1, y_plane_size);
// Disable all optimizations.
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_c), scale,
benchmark_width * benchmark_height);
// Enable optimizations.
for (j = 0; j < benchmark_iterations; j++) {
#ifdef HAS_SCALESAMPLES_NEON
if (opt) {
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
} else {
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
}
#else
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
reinterpret_cast<float*>(dst_opt), scale,
benchmark_width * benchmark_height);
#endif
}
float max_diff = 0;
for (i = 0; i < y_plane_size / 4; ++i) {
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
(reinterpret_cast<float*>(dst_opt)[i]));
if (abs_diff > max_diff) {
max_diff = abs_diff;
}
}
free_aligned_buffer_page_end(orig_y);
return max_diff;
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_C) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, false);
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
float diff = TestScaleSamples(benchmark_width_, benchmark_height_,
benchmark_iterations_, 1.2f, true);
EXPECT_EQ(0, diff);
}
} // namespace libyuv


@ -36,6 +36,9 @@ static __inline int Abs(int v) {
return v >= 0 ? v : -v;
}
static __inline float FAbs(float v) {
return v >= 0 ? v : -v;
}
#define OFFBY 0
// Scaling uses 16.16 fixed point to step thru the source image, so a
@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width,
uint8* var; \
uint8* var##_mem; \
var##_mem = reinterpret_cast<uint8*>(malloc(((size) + 4095 + 63) & ~4095)); \
var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \
~4095) - (size)) & ~63);
var = (uint8*)((intptr_t)(var##_mem + \
(((size) + 4095 + 63) & /* NOLINT */ \
~4095) - \
(size)) & \
~63);
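
A note on what the macro above computes (behavior unchanged by this reformat): it allocates a page-rounded block with cache-line slack, then places the buffer so it ends at, or within a cache line of, the end of the allocation while its start is rounded down to a 64-byte boundary, which helps overreads past the buffer end run off the allocation. The same arithmetic as a standalone sketch (function and variable names are illustrative):

#include <stdint.h>
#include <stdlib.h>

static uint8_t* AllocBufferPageEnd(size_t size, uint8_t** mem_out) {
  size_t alloc = (size + 4095 + 63) & ~(size_t)4095;  // page multiple + slack
  uint8_t* mem = (uint8_t*)malloc(alloc);
  uint8_t* buf =
      (uint8_t*)((intptr_t)(mem + alloc - size) & ~(intptr_t)63);  // 64-aligned
  *mem_out = mem;  // free(mem), not buf, when done
  return buf;
}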
#define free_aligned_buffer_page_end(var) \
free(var##_mem); \