diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 1a46eef58..be21e07fb 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -203,6 +203,7 @@ extern "C" {
 #define HAS_I444TOARGBROW_AVX2
 #define HAS_I411TOARGBROW_AVX2
 #define HAS_J400TOARGBROW_AVX2
+#define HAS_J422TOARGBROW_AVX2
 // TODO(fbarchard): Port to Neon
 #define HAS_ARGBTORGB565DITHERROW_SSE2
 #define HAS_ARGBTORGB565DITHERROW_AVX2
@@ -233,7 +234,6 @@ extern "C" {
 #define HAS_YUY2TOUV422ROW_AVX2
 #define HAS_YUY2TOUVROW_AVX2
 #define HAS_YUY2TOYROW_AVX2
-#define HAS_J422TOARGBROW_AVX2
 
 // The following require HAS_I422TOARGBROW_AVX2
 #if defined(HAS_I422TOARGBROW_AVX2)
diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 9660d1d9d..19affbe86 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -52,6 +52,7 @@ extern "C" {
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
 #define HAS_SCALEADDROWS_NEON
+#define HAS_SCALEFILTERCOLS_NEON
 #endif
 
 // The following are available on Mips platforms:
@@ -311,6 +312,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
 void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint16* dst_ptr, int src_width, int src_height);
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx);
+
+void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                              int dst_width, int x, int dx);
+
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst, int dst_width);
diff --git a/libyuv.gypi b/libyuv.gypi
index ae54a44ef..8a99de567 100644
--- a/libyuv.gypi
+++ b/libyuv.gypi
@@ -55,6 +55,7 @@
       'source/row_win.cc',
       'source/scale.cc',
       'source/scale_argb.cc',
+      'source/scale_any.cc',
       'source/scale_common.cc',
       'source/scale_mips.cc',
       'source/scale_posix.cc',
diff --git a/source/scale.cc b/source/scale.cc
index 7db3b81ce..72b2c203d 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -928,6 +928,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_SSSE3;
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
 #endif
   if (y > max_y) {
     y = max_y;
@@ -1119,6 +1127,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_SSSE3;
   }
+#endif
+#if defined(HAS_SCALEFILTERCOLS_NEON)
+  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_NEON;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleFilterCols = ScaleFilterCols_NEON;
+    }
+  }
 #endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
diff --git a/source/scale_any.cc b/source/scale_any.cc
new file mode 100644
index 000000000..702bca263
--- /dev/null
+++ b/source/scale_any.cc
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
+#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                        \
+                 int dst_width, int x, int dx) {                              \
+      int n = dst_width & ~MASK;                                              \
+      if (n > 0) {                                                            \
+        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                \
+      }                                                                       \
+      TERP_C(dst_ptr + n * BPP, src_ptr,                                      \
+             dst_width & MASK, x + n * dx, dx);                               \
+    }
+
+#ifdef HAS_SCALEFILTERCOLS_NEON
+CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
+#endif
+#undef CANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 89d327562..43b8b5aee 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -575,6 +575,76 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n)                        \
+    "lsr        r12, %3, #16                   \n" \
+    "add        r12, %1, r12                   \n" \
+    "add        %3, %3, %4                     \n" \
+    "vld2.8     {d6["#n"], d7["#n"]}, [r12]    \n"
+
+// The NEON version mimics the C blend (BLENDER in scale_common.cc):
+// a + ((f * (b - a)) >> 16), where a and b are adjacent source pixels
+// and f is the low 16 bits of the 16.16 fixed-point position x.
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int tmp[4] = {0, 1, 2, 3};
+  asm volatile (
+    ".p2align   2                              \n"
+    "vdup.32    q0, %3                         \n"  // x
+    "vdup.32    q1, %4                         \n"  // dx
+    "vld1.32    {q2}, [%5]                     \n"  // 0 1 2 3
+    "vshl.i32   q3, q1, #2                     \n"  // 4 * dx
+    "vmul.s32   q1, q1, q2                     \n"
+    // x         , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "vadd.s32   q1, q1, q0                     \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "vadd.s32   q2, q1, q3                     \n"
+    "vshl.i32   q0, q3, #1                     \n"  // 8 * dx
+  "1:                                          \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "vmov       q10, q1                        \n"
+    "vmov       q11, q2                        \n"
+    "vuzp.16    q10, q11                       \n"  // q10 = fractions f
+    "vmovl.u8   q8, d6                         \n"  // a pixels -> 16 bit
+    "vmovl.u8   q9, d7                         \n"  // b pixels -> 16 bit
+    "vsubl.s16  q11, d18, d16                  \n"  // b - a, low 4 lanes
+    "vsubl.s16  q12, d19, d17                  \n"  // b - a, high 4 lanes
+    "vmovl.u16  q13, d20                       \n"  // f -> 32 bit
+    "vmovl.u16  q10, d21                       \n"
+    "vmul.s32   q11, q11, q13                  \n"  // f * (b - a)
+    "vmul.s32   q12, q12, q10                  \n"
+    "vshrn.s32  d18, q11, #16                  \n"  // >> 16
+    "vshrn.s32  d19, q12, #16                  \n"
+    "vadd.s16   q8, q8, q9                     \n"  // a + (f * (b - a) >> 16)
+    "vmovn.s16  d6, q8                         \n"  // pack to 8 bit
+
+    MEMACCESS(0)
\n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr) // %0 + : "r"(src_ptr), // %1 + "r"(dst_width), // %2 + "r"(x), // %3 + "r"(dx), // %4 + "r"(tmp) // %5 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 413d005a2..faa859f7d 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -578,6 +578,72 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add x12, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}["#n"], [x12] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int tmp[4] = {0, 1, 2, 3}; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "shrn v6.4h, v16.4s, #16 \n" + "shrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + MEMACCESS(0) + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr) // %0 + : "r"(src_ptr), // %1 + "r"(dst_width), // %2 + "r"(static_cast(x)), // %3 + "r"(static_cast(dx)), // %4 + "r"(tmp) // %5 + : "memory", "cc", "x12", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride,