diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ae9f595c8..e51155c08 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -441,6 +441,7 @@ extern "C" {
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_BYTETOFLOATROW_NEON
+#define HAS_CONVERT16TO8ROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_DETILEROW_NEON
 #define HAS_DETILESPLITUVROW_NEON
@@ -2596,6 +2597,14 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width);
+void Convert16To8Row_Any_NEON(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 
 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 53e746794..c662cfef6 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -147,6 +147,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     height = 1;
     src_stride_y = dst_stride_y = 0;
   }
+#if defined(HAS_CONVERT16TO8ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    Convert16To8Row = Convert16To8Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_NEON;
+    }
+  }
+#endif
 #if defined(HAS_CONVERT16TO8ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     Convert16To8Row = Convert16To8Row_Any_SSSE3;
diff --git a/source/row_any.cc b/source/row_any.cc
index 2d30e0a56..089e518af 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1481,6 +1481,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_NEON
+ANY11C(Convert16To8Row_Any_NEON,
+       Convert16To8Row_NEON,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       15)
+#endif
 #ifdef HAS_CONVERT8TO16ROW_SSE2
 ANY11C(Convert8To16Row_Any_SSE2,
        Convert8To16Row_SSE2,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index cda171ada..8ba71d07e 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -3599,7 +3599,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
                          int scale,
                          int width) {
   asm volatile(
-      "vdup.16     q2, %2                        \n"
+      "vdup.16     q2, %3                        \n"
       "1:                                        \n"
       "vld1.16     {q0}, [%0]!                   \n"
       "vld1.16     {q1}, [%0]!                   \n"
@@ -3607,13 +3607,12 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
       "vmul.u16    q1, q1, q2                    \n"
       "vst1.16     {q0}, [%1]!                   \n"
       "vst1.16     {q1}, [%1]!                   \n"
-      "subs        %3, %3, #16                   \n"  // 16 src pixels per loop
+      "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "q0", "q1", "q2");
 }
 
@@ -3622,7 +3621,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "vdup.16     q0, %2                        \n"
+      "vdup.16     q0, %3                        \n"
       "1:                                        \n"
       "vld1.16     {q1}, [%0]!                   \n"
       "vld1.16     {q2}, [%0]!                   \n"
@@ -3640,6 +3639,34 @@ void DivideRow_16_NEON(const uint16_t* src_y,
       "vmovn.u32   d5, q2                        \n"
       "vst1.16     {q1}, [%1]!                   \n"
       "vst1.16     {q2}, [%1]!                   \n"
+      "subs        %2, %2, #16                   \n"  // 16 src pixels per loop
+      "bgt         1b                            \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+}
+
+// Use scale to convert 16 bit lsb formats to 8 bit, depending how many bits:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  asm volatile(
+      "vdup.16     q2, %2                        \n"
+      "1:                                        \n"
+      "vld1.16     {q0}, [%0]!                   \n"
+      "vld1.16     {q1}, [%0]!                   \n"
+      "vqdmulh.s16 q0, q0, q2                    \n"
+      "vqdmulh.s16 q1, q1, q2                    \n"
+      "vqshrn.u16  d0, q0, #1                    \n"
+      "vqshrn.u16  d1, q1, #1                    \n"
+      "vst1.16     {q0}, [%1]!                   \n"
       "subs        %3, %3, #16                   \n"  // 16 src pixels per loop
       "bgt         1b                            \n"
       : "+r"(src_y),  // %0
@@ -3647,7 +3674,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
         "+r"(scale),  // %2
         "+r"(width)   // %3
       :
-      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
+      : "cc", "memory", "q0", "q1", "q2");
 }
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 27723810d..8d43d5940 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -4070,21 +4070,19 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
                          int scale,
                          int width) {
   asm volatile(
-      "dup         v2.8h, %w2                    \n"
+      "dup         v2.8h, %w3                    \n"
       "1:                                        \n"
       "ldp         q0, q1, [%0], #32             \n"
       "mul         v0.8h, v0.8h, v2.8h           \n"
       "prfm        pldl1keep, [%0, 448]          \n"
       "mul         v1.8h, v1.8h, v2.8h           \n"
-      "stp         q0, q1, [%1]                  \n"  // store 16 pixels
-      "add         %1, %1, #32                   \n"
-      "subs        %w3, %w3, #16                 \n"  // 16 src pixels per loop
+      "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "v0", "v1", "v2");
 }
 
@@ -4093,7 +4091,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
                        int scale,
                        int width) {
   asm volatile(
-      "dup         v0.8h, %w2                    \n"
+      "dup         v0.8h, %w3                    \n"
       "1:                                        \n"
       "ldp         q1, q2, [%0], #32             \n"
       "ushll       v3.4s, v1.4h, #0              \n"
@@ -4109,18 +4107,44 @@ void DivideRow_16_NEON(const uint16_t* src_y,
       "shrn        v4.4h, v4.4s, #16             \n"
       "shrn2       v3.8h, v1.4s, #16             \n"
       "shrn2       v4.8h, v2.4s, #16             \n"
-      "stp         q3, q3, [%1]                  \n"  // store 16 pixels
-      "add         %1, %1, #32                   \n"
-      "subs        %w3, %w3, #16                 \n"  // 16 src pixels per loop
+      "stp         q3, q4, [%1], #32             \n"  // store 16 pixels
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
       "b.gt        1b                            \n"
       : "+r"(src_y),  // %0
         "+r"(dst_y),  // %1
-        "+r"(scale),  // %2
-        "+r"(width)   // %3
-      :
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
 
+// Use scale to convert 16 bit lsb formats to 8 bit, depending how many bits:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_NEON(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  asm volatile(
+      "dup         v2.8h, %w3                    \n"
+      "1:                                        \n"
+      "ldp         q0, q1, [%0], #32             \n"
+      "sqdmulh     v0.8h, v0.8h, v2.8h           \n"
+      "sqdmulh     v1.8h, v1.8h, v2.8h           \n"
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
+      "uqshrn      v0.8b, v0.8h, #1              \n"
+      "uqshrn2     v0.16b, v1.8h, #1             \n"
+      "str         q0, [%1], #16                 \n"  // store 16 pixels
+      "b.gt        1b                            \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "v0", "v1", "v2");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus