From 8676ad7004fbe86a855923938e4db6a83fe40b91 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 4 Aug 2017 16:05:25 -0700 Subject: [PATCH] scale float samples and return max value BUG=libyuv:717 TEST=ScaleSum unittest to compare C vs Arm implementation TBR=kjellander@chromium.org Change-Id: Iaa7af5547d979aad4722f868d31b405340115748 Reviewed-on: https://chromium-review.googlesource.com/600534 Reviewed-by: Cheng Wang --- README.chromium | 2 +- include/libyuv/row.h | 13 + include/libyuv/version.h | 2 +- source/compare_neon.cc | 98 ++-- source/compare_neon64.cc | 90 ++- source/rotate_neon64.cc | 526 ++++++++--------- source/rotate_win.cc | 6 +- source/row_common.cc | 19 + source/row_neon64.cc | 47 ++ source/row_win.cc | 108 ++-- source/scale_neon64.cc | 1196 +++++++++++++++++++------------------- source/scale_win.cc | 14 +- unit_test/planar_test.cc | 145 +++++ unit_test/unit_test.h | 10 +- 14 files changed, 1245 insertions(+), 1031 deletions(-) diff --git a/README.chromium b/README.chromium index 65fef7501..13a188fe8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1663 +Version: 1664 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index e93faa849..aefe2add4 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -359,6 +359,11 @@ extern "C" { #define HAS_SOBELYROW_NEON #endif +// The following are available on AArch64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALESUMSAMPLES_NEON +#endif + // The following are available on Mips platforms: #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \ (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) @@ -3152,6 +3157,14 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, const uint8* luma, uint32 lumacoeff); +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width); +void ScaleSamples_C(const float* src, float* dst, float scale, int width); +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e3b978ab6..97ced6a7a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1663 +#define LIBYUV_VERSION 1664 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 0f374a70f..2c01f5c1e 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -26,67 +26,61 @@ extern "C" { uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 diff; - asm volatile ( - "vmov.u16 q4, #0 \n" // accumulator + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator - "1: \n" - "vld1.8 {q0, q1}, [%0]! \n" - "vld1.8 {q2, q3}, [%1]! \n" - "veor.32 q0, q0, q2 \n" - "veor.32 q1, q1, q3 \n" - "vcnt.i8 q0, q0 \n" - "vcnt.i8 q1, q1 \n" - "subs %2, %2, #32 \n" - "vadd.u8 q0, q0, q1 \n" // 16 byte counts - "vpadal.u8 q4, q0 \n" // 8 shorts - "bgt 1b \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! 
\n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" - "vpaddl.u16 q0, q4 \n" // 4 ints - "vpadd.u32 d0, d0, d1 \n" - "vpadd.u32 d0, d0, d0 \n" - "vmov.32 %3, d0[0] \n" - - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(diff) - : - : "cc", "q0", "q1", "q2", "q3", "q4"); + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); return diff; } uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" - "1: \n" - "vld1.8 {q0}, [%0]! \n" - "vld1.8 {q1}, [%1]! \n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); return sse; } diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index da404cc96..bedb8d1b0 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -24,63 +24,57 @@ extern "C" { // uses short accumulator which restricts count to 131 KB uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { uint32 diff; - asm volatile ( - "movi v4.8h, #0 \n" + asm volatile( + "movi v4.8h, #0 \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" - "eor v0.16b, v0.16b, v2.16b \n" - "eor v1.16b, v1.16b, v3.16b \n" - "cnt v0.16b, v0.16b \n" - "cnt v1.16b, v1.16b \n" - "subs %w2, %w2, #32 \n" - "add v0.16b, v0.16b, v1.16b \n" - "uadalp v4.8h, v0.16b \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "uaddlv s4, v4.8h \n" - "fmov %w3, s4 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(diff) - : - : "cc", "v0", "v1", "v2", "v3", "v4"); + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); return diff; } uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { 
uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 137336a07..59db7f9f3 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -30,14 +30,14 @@ void TransposeWx8_NEON(const uint8* src, int dst_stride, int width) { const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w3, %w3, #8 \n" + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" "ld1 {v0.8b}, [%0], %5 \n" @@ -92,109 +92,108 @@ void TransposeWx8_NEON(const uint8* src, "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %w3, %w3, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w3, %w3, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" - "cmp %w3, #4 \n" - "b.lt 2f \n" + "cmp %w3, #4 \n" + "b.lt 2f \n" - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "ld1 {v2.16b}, [%4] \n" + "ld1 {v2.16b}, [%4] \n" - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. - "st1 {v3.s}[0], [%0], %6 \n" - "st1 {v3.s}[1], [%0], %6 \n" - "st1 {v3.s}[2], [%0], %6 \n" - "st1 {v3.s}[3], [%0] \n" + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. + "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" - "add %0, %2, #4 \n" - "st1 {v0.s}[0], [%0], %6 \n" - "st1 {v0.s}[1], [%0], %6 \n" - "st1 {v0.s}[2], [%0], %6 \n" - "st1 {v0.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %w3, %w3, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w3, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld1 {v0.h}[0], [%0], %5 \n" - "ld1 {v1.h}[0], [%0], %5 \n" - "ld1 {v0.h}[1], [%0], %5 \n" - "ld1 {v1.h}[1], [%0], %5 \n" - "ld1 {v0.h}[2], [%0], %5 \n" - "ld1 {v1.h}[2], [%0], %5 \n" - "ld1 {v0.h}[3], [%0], %5 \n" - "ld1 {v1.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "st1 {v3.8b}, [%0], %6 \n" - "st1 {v2.8b}, [%0] \n" + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %w3, %w3, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - "ld1 {v0.b}[0], [%1], %5 \n" - "ld1 {v0.b}[1], [%1], %5 \n" - "ld1 {v0.b}[2], [%1], %5 \n" - "ld1 {v0.b}[3], [%1], %5 \n" 
- "ld1 {v0.b}[4], [%1], %5 \n" - "ld1 {v0.b}[5], [%1], %5 \n" - "ld1 {v0.b}[6], [%1], %5 \n" - "ld1 {v0.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" - "st1 {v0.8b}, [%2] \n" + "st1 {v0.8b}, [%2] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } static uint8 kVTbl4x4TransposeDi[32] = { @@ -209,212 +208,215 @@ void TransposeUVWx8_NEON(const uint8* src, int dst_stride_b, int width) { const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w4, %w4, #8 \n" + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" - "ld1 {v0.16b}, [%0], %5 \n" - "ld1 {v1.16b}, [%0], %5 \n" - "ld1 {v2.16b}, [%0], %5 \n" - "ld1 {v3.16b}, [%0], %5 \n" - "ld1 {v4.16b}, [%0], %5 \n" - "ld1 {v5.16b}, [%0], %5 \n" - "ld1 {v6.16b}, [%0], %5 \n" - "ld1 {v7.16b}, [%0] \n" + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s 
\n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "st1 {v16.d}[0], [%0], %6 \n" - "st1 {v18.d}[0], [%0], %6 \n" - "st1 {v17.d}[0], [%0], %6 \n" - "st1 {v19.d}[0], [%0], %6 \n" - "st1 {v16.d}[1], [%0], %6 \n" - "st1 {v18.d}[1], [%0], %6 \n" - "st1 {v17.d}[1], [%0], %6 \n" - "st1 {v19.d}[1], [%0] \n" + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - "st1 {v20.d}[0], [%0], %7 \n" - "st1 {v22.d}[0], [%0], %7 \n" - "st1 {v21.d}[0], [%0], %7 \n" - "st1 {v23.d}[0], [%0], %7 \n" - "st1 {v20.d}[1], [%0], %7 \n" - "st1 {v22.d}[1], [%0], %7 \n" - "st1 {v21.d}[1], [%0], %7 \n" - "st1 {v23.d}[1], [%0] \n" + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %w4, %w4, #8 \n" // w -= 8 - "b.ge 1b \n" + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %w4, %w4, #8 \n" - "b.eq 4f \n" + // add 8 back to counter. if the result is 0 there are + // no residuals. 
+ "adds %w4, %w4, #8 \n" + "b.eq 4f \n" - // some residual, so between 1 and 7 lines left to transpose - "cmp %w4, #2 \n" - "b.lt 3f \n" + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" - "cmp %w4, #4 \n" - "b.lt 2f \n" + "cmp %w4, #4 \n" + "b.lt 2f \n" - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "st1 {v16.s}[0], [%0], %6 \n" - "st1 {v16.s}[1], [%0], %6 \n" - "st1 {v16.s}[2], [%0], %6 \n" - "st1 {v16.s}[3], [%0], %6 \n" + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" - "add %0, %2, #4 \n" - "st1 {v18.s}[0], [%0], %6 \n" - "st1 {v18.s}[1], [%0], %6 \n" - "st1 {v18.s}[2], [%0], %6 \n" - "st1 {v18.s}[3], [%0] \n" + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - "st1 {v17.s}[0], [%0], %7 \n" - "st1 {v17.s}[1], [%0], %7 \n" - "st1 {v17.s}[2], [%0], %7 \n" - "st1 {v17.s}[3], [%0], %7 \n" + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" - "add %0, %3, #4 \n" - "st1 {v19.s}[0], [%0], %7 \n" - "st1 {v19.s}[1], [%0], %7 \n" - "st1 {v19.s}[2], [%0], %7 \n" - "st1 {v19.s}[3], [%0] \n" + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %w4, %w4, #4 \n" // w -= 4 - "b.eq 4f \n" + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w4, #2 \n" - "b.lt 3f \n" + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - "ld2 
{v2.h, v3.h}[2], [%0], %5 \n" - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - "ld2 {v2.h, v3.h}[3], [%0] \n" + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" - "mov %0, %2 \n" + "mov %0, %2 \n" - "st1 {v4.d}[0], [%0], %6 \n" - "st1 {v6.d}[0], [%0] \n" + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" - "mov %0, %3 \n" + "mov %0, %3 \n" - "st1 {v5.d}[0], [%0], %7 \n" - "st1 {v7.d}[0], [%0] \n" + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %w4, %w4, #2 \n" // w -= 2 - "b.eq 4f \n" + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" - // 1x8 block - "3: \n" - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - "ld2 {v0.b, v1.b}[7], [%1] \n" + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" - "st1 {v0.d}[0], [%2] \n" - "st1 {v1.d}[0], [%3] \n" + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" - "4: \n" + "4: \n" - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width) // %4 - : "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride_a)), // %6 - "r"(static_cast(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v30", "v31" - ); + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride_a)), // %6 + "r"(static_cast(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/source/rotate_win.cc b/source/rotate_win.cc index 93a5c28a4..ee523a0b9 100644 --- a/source/rotate_win.cc +++ b/source/rotate_win.cc @@ -172,7 +172,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. 
movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -192,8 +192,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src, punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 diff --git a/source/row_common.cc b/source/row_common.cc index bf953eeff..b02aa2b5d 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2639,6 +2639,25 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, } #endif +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++ * scale; + *dst++ = v; + fmax = (v > fmax) ? v : fmax; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float v = *src++ * scale; + *dst++ = v; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 2f4d149d6..456c6ea5a 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2612,6 +2612,53 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { : "cc", "memory", "v1", "v2", "v3"); } +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v3.4s, #0 \n" // max + "movi v4.4s, #0 \n" // max + + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %4.s[0] \n" // scale + "fmul v2.4s, v2.4s, %4.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "fmax v3.4s, v3.4s, v1.4s \n" // max + "fmax v4.4s, v4.4s, v2.4s \n" + "b.gt 1b \n" + "fmax v3.4s, v3.4s, v4.4s \n" // max + "fmaxv %s3, v3.4s \n" // signed max acculator + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width), // %2 + "=w"(fmax) // %3 + : "w"(scale) // %4 + : "cc", "memory", "v1", "v2", "v3", "v4"); + return fmax; +} + +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { + asm volatile( + "1: \n" + "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples + "subs %w2, %w2, #8 \n" // 8 processed per loop + "fmul v1.4s, v1.4s, %3.s[0] \n" // scale + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples + "b.gt 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 202f2b8d4..03a7e9506 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1410,9 +1410,9 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1426,7 +1426,7 @@ __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps 
qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1482,9 +1482,9 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1499,7 +1499,7 @@ __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, psraw xmm1, 8 packsswb xmm0, xmm1 - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1549,9 +1549,9 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1565,7 +1565,7 @@ __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned - // step 3 - store 16 U and 16 V values + // step 3 - store 16 U and 16 V values vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] @@ -1617,9 +1617,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, its 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1634,7 +1634,7 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - // step 3 - store 16 U and 16 V values + // step 3 - store 16 U and 16 V values vextractf128 [edx], ymm0, 0 // U vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] @@ -1750,9 +1750,9 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1766,7 +1766,7 @@ __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 
8] @@ -1822,9 +1822,9 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1838,7 +1838,7 @@ __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -1894,9 +1894,9 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1910,7 +1910,7 @@ __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, packsswb xmm0, xmm1 paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values + // step 3 - store 8 U and 8 V values movlps qword ptr [edx], xmm0 // U movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] @@ -2927,7 +2927,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf, psrlw xmm0, 6 packuswb xmm0, xmm0 // G - // Step 2: Weave into ARGB + // Step 2: Weave into ARGB punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 // BGRA first 4 pixels @@ -2975,8 +2975,8 @@ __declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf, vpsrlw ymm0, ymm0, 6 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels @@ -4067,7 +4067,7 @@ __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, sub edx, esi sub edi, esi - // 8 pixel loop. + // 8 pixel loop. convertloop8: movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 @@ -4123,7 +4123,7 @@ __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0, sub edx, esi sub edi, esi - // 32 pixel loop. + // 32 pixel loop. convertloop32: vmovdqu ymm0, [esi] // alpha vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 @@ -4183,7 +4183,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, sub ecx, 4 jl convertloop4b // less than 4 pixels? - // 4 pixel loop. + // 4 pixel loop. convertloop4: movdqu xmm3, [eax] // src argb lea eax, [eax + 16] @@ -4212,7 +4212,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. convertloop1: movd xmm3, [eax] // src argb lea eax, [eax + 4] @@ -5256,7 +5256,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, cvtps2dq xmm5, xmm5 // 0.16 fixed point packssdw xmm5, xmm5 // 16 bit shorts - // 4 pixel loop small blocks. + // 4 pixel loop small blocks. 
s4: // top left movdqu xmm0, [eax] @@ -5298,7 +5298,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, jmp l4b - // 4 pixel loop + // 4 pixel loop l4: // top left movdqu xmm0, [eax] @@ -5350,7 +5350,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5392,7 +5392,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -5438,7 +5438,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5481,7 +5481,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 @@ -5493,7 +5493,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 @@ -5524,7 +5524,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts @@ -5598,7 +5598,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5608,7 +5608,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5638,7 +5638,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 @@ -5678,7 +5678,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -5689,7 +5689,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 @@ -5784,7 +5784,7 @@ __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, cmp ebx, 0x02010003 je shuf_2103 - // TODO(fbarchard): Use one source pointer and 3 offsets. + // TODO(fbarchard): Use one source pointer and 3 offsets. shuf_any1: movzx ebx, byte ptr [esi] movzx ebx, byte ptr [eax + ebx] @@ -5971,7 +5971,7 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel @@ -6072,7 +6072,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, pxor xmm5, xmm5 sub edx, eax - // 8 pixel loop. + // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 @@ -6110,7 +6110,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, vpxor ymm5, ymm5, ymm5 sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 @@ -6144,7 +6144,7 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src, mov ecx, [esp + 16] /* width */ sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts @@ -6252,7 +6252,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 01478b5fc..146feff0c 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -26,19 +26,19 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, uint8* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x1 average down and write 16x1. @@ -47,22 +47,24 @@ void ScaleRowDown2Linear_NEON(const uint8* src_ptr, uint8* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc - "subs %w2, %w2, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // add adjacent - "uaddlp v1.8h, v1.16b \n" - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #1 \n" - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); + asm volatile( + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post + // inc + "subs %w2, %w2, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // add adjacent + "uaddlp v1.8h, v1.16b \n" + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and + // pack + "rshrn2 v0.16b, v1.8h, #1 \n" + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x2 average down and write 16x1. 
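For reference, the 2x2 rounding box average that ScaleRowDown2Box_NEON in the next hunk computes (uaddlp adds adjacent columns of row 0, uadalp folds in row 1, and rshrn #2 rounds and divides by 4) corresponds to roughly the following C sketch; the helper name and the standard-typed signature are illustrative rather than taken from this patch:

#include <stddef.h>
#include <stdint.h>

// Illustrative C sketch of the 2x2 rounding box filter: each output pixel
// is the rounded average of a 2x2 block taken from two adjacent source rows.
static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst,
                                    int dst_width) {
  const uint8_t* s = src_ptr;               // row 0
  const uint8_t* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    // Sum the four bytes and round: mirrors uaddlp/uadalp followed by rshrn #2.
    *dst++ = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}

The +2 before the shift reproduces rshrn's round-to-nearest behavior, which is what lets a scalar reference path and the NEON path be compared exactly in the unit tests.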
@@ -70,28 +72,30 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + + // row1 + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and + // pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); } void ScaleRowDown4_NEON(const uint8* src_ptr, @@ -99,18 +103,17 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, uint8* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } void ScaleRowDown4Box_NEON(const uint8* src_ptr, @@ -120,30 +123,29 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, const uint8* src_ptr1 = src_ptr + src_stride; const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( - "1: \n" - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - "ld1 {v1.16b}, [%2], #16 \n" - "ld1 {v2.16b}, [%3], #16 \n" - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp 
v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write @@ -154,118 +156,116 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, uint8* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, + // v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, 
v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", - "v20", "memory", "cc" - ); + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); } void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" - ); + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); } static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; @@ -282,21 +282,20 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, uint8* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( - "ld1 {v3.16b}, [%3] \n" - "1: \n" - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - "st1 {v2.8b}, [%1], #8 \n" - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) 
// %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 @@ -307,118 +306,117 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, const uint8* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - "ld1 {v29.8h}, [%5] \n" - "ld1 {v30.16b}, [%6] \n" - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + 
st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + // Align for table lookup, vtbl requires registers to + // be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); } // 32x2 -> 12x1 @@ -428,99 +426,98 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. 
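// A minimal scalar sketch of the "multiply by 65536 / n and take the upper
// 16 bits" trick used by the SQRDMULH instructions above, i.e. fixed-point
// reciprocal division by a non-power-of-2 (illustrative only; DivByMulHi is
// a hypothetical helper, not a libyuv function):
static inline unsigned int DivByMulHi(unsigned int v, unsigned int n) {
  unsigned int m = (65536u + n / 2u) / n;  // reciprocal of n in 16.16
  return (v * m) >> 16;                    // keep the upper 16 bits
}
// SQRDMULH additionally doubles and rounds the product, so the table
// constants (kMult38_Div6 / kMult38_Div9) are pre-scaled to account for that.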
ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - "ld1 {v30.8h}, [%4] \n" - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" - // Align for table lookup, vtbl requires registers to - // be adjacent + // Align for table lookup, vtbl requires registers to + // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - "st1 {v3.8b}, [%1], #8 \n" - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v30", "v31", "memory", "cc" - ); + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); } void ScaleAddRows_NEON(const uint8* src_ptr, @@ -529,32 +526,32 @@ void ScaleAddRows_NEON(const uint8* src_ptr, int src_width, int src_height) { const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // 
Clobber List + ); } // clang-format off @@ -644,84 +641,83 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, int dst_width, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" - // Blend 25 / 75. - "25: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" - // Blend 50 / 50. - "50: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" - // Blend 75 / 25. - "75: \n" - "ld1 {v1.16b}, [%1], #16 \n" - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" + // Blend 100 / 0 - Copy row unchanged. 
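// A scalar sketch of the general-purpose blend performed by the
// UMULL / UMLAL / RSHRN #8 sequence above (illustrative only; BlendRows is a
// hypothetical helper, not a libyuv function).  f is source_y_fraction in
// [0, 256); the first row is weighted by 256 - f, the second by f:
static inline unsigned char BlendRows(unsigned char a, unsigned char b,
                                      unsigned int f) {
  return (unsigned char)((a * (256u - f) + b * f + 128u) >> 8);  // rounded
}
// The f == 128 special case is exactly (a + b + 1) >> 1, i.e. one URHADD;
// the f == 64 / 192 cases use two URHADDs as a close, faster approximation.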
+ "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" - "99: \n" - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction),// %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" - ); + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } void ScaleARGBRowDown2_NEON(const uint8* src_ptr, @@ -729,21 +725,21 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, uint8* dst, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - "ld2 {v0.4s, v1.4s}, [%0], #32 \n" - "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "st1 {v3.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (dst), // %1 - "+r" (dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "ld2 {v0.4s, v1.4s}, [%0], #32 \n" + "ld2 {v2.4s, v3.4s}, [%0], #32 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "st1 {v3.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, @@ -751,61 +747,65 @@ void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, uint8* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - // load 8 ARGB pixels. - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #1 \n" - "rshrn v2.8b, v2.8h, #1 \n" - "rshrn v3.8b, v3.8h, #1 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + asm volatile( + "1: \n" + // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. 
+ "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and + // pack + "rshrn v1.8b, v1.8h, #1 \n" + "rshrn v2.8b, v2.8h, #1 \n" + "rshrn v3.8b, v3.8h, #1 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (src_stride), // %1 - "+r" (dst), // %2 - "+r" (dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" - ); + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + // more ARGB + // pixels. + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and + // pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. @@ -816,21 +816,20 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, uint8* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( - "1: \n" - "ld1 {v0.s}[0], [%0], %3 \n" - "ld1 {v0.s}[1], [%0], %3 \n" - "ld1 {v0.s}[2], [%0], %3 \n" - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64)(src_stepx * 4)) // %3 - : "memory", "cc", "v0" - ); + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); } // Reads 4 pixels at a time. @@ -842,41 +841,41 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "add %1, %1, %0 \n" - "1: \n" - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 - "ld1 {v1.8b}, [%1], %4 \n" - "ld1 {v2.8b}, [%0], %4 \n" - "ld1 {v3.8b}, [%1], %4 \n" - "ld1 {v4.8b}, [%0], %4 \n" - "ld1 {v5.8b}, [%1], %4 \n" - "ld1 {v6.8b}, [%0], %4 \n" - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> + // 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
+ "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // clang-format off @@ -898,30 +897,25 @@ void ScaleARGBCols_NEON(uint8* dst_argb, int64 x64 = (int64)x; int64 dx64 = (int64)dx; int64 tmp64; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) + asm volatile( + "1: \n" LOAD1_DATA32_LANE( + v0, 0) LOAD1_DATA32_LANE(v0, 1) LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) LOAD1_DATA32_LANE(v1, 0) LOAD1_DATA32_LANE( + v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1" - ); + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per + // loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE diff --git a/source/scale_win.cc b/source/scale_win.cc index 0c5b3a1e1..3e9331283 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -816,7 +816,7 @@ __declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr, mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] @@ -847,7 +847,7 @@ __declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr, mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] @@ -939,7 +939,7 @@ __declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx psrlw xmm2, 9 // 7 bit fractions. @@ -1194,7 +1194,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, sub ecx, 4 jl xloop49 - // 4 Pixel loop. + // 4 Pixel loop. xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels @@ -1218,7 +1218,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, test ecx, 2 je xloop29 - // 2 Pixels. + // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels pextrw eax, xmm2, 5 // get x2 integer. @@ -1231,7 +1231,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, test ecx, 1 je xloop99 - // 1 Pixels. + // 1 Pixels. movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 xloop99: @@ -1309,7 +1309,7 @@ __declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder psrlw xmm2, 9 // 7 bit fractions. 
movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels pshufb xmm2, xmm5 // 00000000 diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 28d557a59..a0d7881ed 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -11,6 +11,9 @@ #include #include +// row.h defines SIMD_ALIGNED, overriding unit_test.h +#include "libyuv/row.h" /* For ScaleSumSamples_Neon */ + #include "../unit_test/unit_test.h" #include "libyuv/compare.h" #include "libyuv/convert.h" @@ -2518,4 +2521,146 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +float TestScaleSumSamples(int benchmark_width, + int benchmark_height, + int benchmark_iterations, + float scale, + bool opt) { + int i, j; + float max_c, max_opt; + const int y_plane_size = benchmark_width * benchmark_height * 4; + + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + + // Randomize works but may contain some denormals affecting performance. + // MemRandomize(orig_y, y_plane_size); + for (i = 0; i < y_plane_size / 4; ++i) { + (reinterpret_cast(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + } + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 1, y_plane_size); + + // Disable all optimizations. + max_c = ScaleSumSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_c), scale, + benchmark_width * benchmark_height); + + // Enable optimizations. + for (j = 0; j < benchmark_iterations; j++) { +#ifdef HAS_SCALESUMSAMPLES_NEON + if (opt) { + max_opt = ScaleSumSamples_NEON(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); + + } else { + max_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); + } +#else + max_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); +#endif + } + + float max_diff = 0; + for (i = 0; i < y_plane_size / 4; ++i) { + float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - + (reinterpret_cast(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(orig_y); + return max_diff; +} + +TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) { + float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, false); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) { + float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, true); + EXPECT_EQ(0, diff); +} + +float TestScaleSamples(int benchmark_width, + int benchmark_height, + int benchmark_iterations, + float scale, + bool opt) { + int i, j; + const int y_plane_size = benchmark_width * benchmark_height * 4; + + align_buffer_page_end(orig_y, y_plane_size * 3); + uint8* dst_opt = orig_y + y_plane_size; + uint8* dst_c = orig_y + y_plane_size * 2; + + // Randomize works but may contain some denormals affecting performance. + // MemRandomize(orig_y, y_plane_size); + for (i = 0; i < y_plane_size / 4; ++i) { + (reinterpret_cast(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f; + } + + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 1, y_plane_size); + + // Disable all optimizations. + ScaleSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_c), scale, + benchmark_width * benchmark_height); + + // Enable optimizations. 
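// The new tests above assume ScaleSamples_C writes each input float
// multiplied by scale, and that ScaleSumSamples_C performs the same scaling
// while also returning a reduction over the samples (captured as
// max_c / max_opt).  A minimal sketch under that assumption (illustrative
// only; the actual reference lives in row_common.cc, and
// ScaleSumSamplesSketch is a hypothetical name):
static float ScaleSumSamplesSketch(const float* src, float* dst, float scale,
                                   int width) {
  float max_v = 0.f;
  for (int i = 0; i < width; ++i) {
    float v = src[i] * scale;
    dst[i] = v;
    if (v > max_v) {
      max_v = v;  // assumed reduction; exact semantics per row_common.cc
    }
  }
  return max_v;
}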
+ for (j = 0; j < benchmark_iterations; j++) { +#ifdef HAS_SCALESAMPLES_NEON + if (opt) { + max_opt = ScaleSamples_NEON(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); + + } else { + ScaleSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); + } +#else + ScaleSamples_C(reinterpret_cast(orig_y), + reinterpret_cast(dst_opt), scale, + benchmark_width * benchmark_height); +#endif + } + + float max_diff = 0; + for (i = 0; i < y_plane_size / 4; ++i) { + float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - + (reinterpret_cast(dst_opt)[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(orig_y); + return max_diff; +} + +TEST_F(LibYUVPlanarTest, TestScaleSamples_C) { + float diff = TestScaleSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, false); + EXPECT_EQ(0, diff); +} + +TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) { + float diff = TestScaleSamples(benchmark_width_, benchmark_height_, + benchmark_iterations_, 1.2f, true); + EXPECT_EQ(0, diff); +} + } // namespace libyuv diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index 416faa2fb..87201b11d 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -36,6 +36,9 @@ static __inline int Abs(int v) { return v >= 0 ? v : -v; } +static __inline float FAbs(float v) { + return v >= 0 ? v : -v; +} #define OFFBY 0 // Scaling uses 16.16 fixed point to step thru the source image, so a @@ -70,8 +73,11 @@ static inline bool SizeValid(int src_width, uint8* var; \ uint8* var##_mem; \ var##_mem = reinterpret_cast(malloc(((size) + 4095 + 63) & ~4095)); \ - var = (uint8*)((intptr_t)(var##_mem + (((size) + 4095 + 63) & /* NOLINT */ \ - ~4095) - (size)) & ~63); + var = (uint8*)((intptr_t)(var##_mem + \ + (((size) + 4095 + 63) & /* NOLINT */ \ + ~4095) - \ + (size)) & \ + ~63); #define free_aligned_buffer_page_end(var) \ free(var##_mem); \