diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h
index 27aa04b22..aae7232b7 100644
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@@ -268,6 +268,9 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
 void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst, int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width);
diff --git a/source/scale.cc b/source/scale.cc
index 482c5a61e..1da68b87c 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -55,7 +55,9 @@ static void ScalePlaneDown2(int src_width, int src_height,
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
+        ScaleRowDown2Box_NEON);
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_SSE2)
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index 7921219b5..7eb988b65 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    MEMACCESS(0)
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #1                     \n"
+    MEMACCESS(1)
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"              // Clobber List
+  );
+}
+
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index fb68b67d2..dcf89c292 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  asm volatile (
+  "1:                                          \n"
+    MEMACCESS(0)
+    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
+    "uaddlp     v1.8h, v1.16b                  \n"
+    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
+    "rshrn2     v0.16b, v1.8h, #1              \n"
+    MEMACCESS(1)
+    "st1        {v0.16b}, [%1], #16            \n"
+    "b.gt       1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "v0", "v1"              // Clobber List
+  );
+}
+
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
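
Note (not part of the patch): a minimal C sketch of what the new ScaleRowDown2Linear_NEON path computes, assuming the usual libyuv row-function signature. Each output pixel is the rounded average of two horizontally adjacent source pixels from a single row: vpaddl.u8 / uaddlp widen and add adjacent bytes, and vrshrn / rshrn with #1 narrow back with a rounding shift, i.e. (a + b + 1) >> 1. The function name ScaleRowDown2Linear_Ref below is hypothetical and used only for illustration.

#include <stddef.h>
#include <stdint.h>

// Reference sketch: horizontal 2:1 linear (rounded-average) downsample of one row.
static void ScaleRowDown2Linear_Ref(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,  // unused; kept to mirror the row-function signature
                                    uint8_t* dst, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    // Same rounding as vrshrn.u16 #1 / rshrn #1: (a + b + 1) >> 1.
    dst[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}

The NEON path is only selected when dst_width is a multiple of 16 (see the IS_ALIGNED(dst_width, 16) check in scale.cc above), so the 16-pixels-per-iteration loop never needs a scalar tail.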