diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index aae7232b7..63ea8c841 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -51,6 +51,7 @@ extern "C" {
 #define HAS_SCALEROWDOWN38_NEON
 #define HAS_SCALEARGBROWDOWNEVEN_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
+#define HAS_SCALEADDROWS_NEON
 #endif
 
 // The following are available on Mips platforms:
@@ -305,6 +306,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width);
 
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height);
+
 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width);
 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
diff --git a/source/scale.cc b/source/scale.cc
index 6daa02dcb..7db3b81ce 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -758,6 +758,12 @@ static void ScalePlaneBox(int src_width, int src_height,
   }
 #endif
 
+#if defined(HAS_SCALEADDROWS_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
+    ScaleAddRows = ScaleAddRows_NEON;
+  }
+#endif
+
   for (j = 0; j < dst_height; ++j) {
     int boxheight;
     int iy = y >> 16;
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 7eb988b65..bcfe6ab72 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -541,6 +541,40 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
   );
 }
 
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;
+  asm volatile (
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       r12, %5                         \n"
+    "veor      q2, q2, q2                      \n"
+    "veor      q3, q3, q3                      \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "vld1.8     {q0}, [%0], %3                 \n"
+    "vaddw.u8   q3, q3, d1                     \n"
+    "vaddw.u8   q2, q2, d0                     \n"
+    "subs       r12, r12, #1                   \n"
+    "bgt        2b                             \n"
+    MEMACCESS(2)
+    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+    "add        %1, %1, #16                    \n"
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "bgt        1b                             \n"
+  : "+r"(src_tmp),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_ptr),    // %2
+    "+r"(src_stride), // %3
+    "+r"(src_width),  // %4
+    "+r"(src_height)  // %5
+  :
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                           const uint8* src_ptr, ptrdiff_t src_stride,
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index dcf89c292..c9b6416dc 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -545,6 +545,39 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
   );
 }
 
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;
+  asm volatile (
+  "1:                                          \n"
+    "mov       %0, %1                          \n"
+    "mov       x12, %5                         \n"
+    "eor       v2.16b, v2.16b, v2.16b          \n"
+    "eor       v3.16b, v3.16b, v3.16b          \n"
+  "2:                                          \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "ld1        {v0.16b}, [%0], %3             \n"
+    "uaddw2     v3.8h, v3.8h, v0.16b           \n"
+    "uaddw      v2.8h, v2.8h, v0.8b            \n"
+    "subs       x12, x12, #1                   \n"
+    "b.gt       2b                             \n"
+    MEMACCESS(2)
+    "st1        {v2.8h, v3.8h}, [%2], #32      \n"  // store pixels
+    "add        %1, %1, #16                    \n"
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop
+    "b.gt       1b                             \n"
+  : "+r"(src_tmp),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_ptr),    // %2
+    "+r"(src_stride), // %3
+    "+r"(src_width),  // %4
+    "+r"(src_height)  // %5
+  :
+  : "memory", "cc", "x12", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
+
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                           const uint8* src_ptr, ptrdiff_t src_stride,