mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Port box filter to NEON
Bug: libyuv:821
Change-Id: I4a6b9bee2c2fae199c73c9ec7ecb32bde37c1852
Tested: out/Release/libyuv_unittest --gtest_filter=*ScaleFrom1920x1080_Box --libyuv_width=160 --libyuv_height=90 --libyuv_repeat=1000
Reviewed-on: https://chromium-review.googlesource.com/c/1298598
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
This commit is contained in:
parent
b416d36c89
commit
b36c86fdfe
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1721
|
||||
Version: 1722
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -58,6 +58,7 @@ extern "C" {
|
||||
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
|
||||
#define HAS_FIXEDDIV1_X86
|
||||
#define HAS_FIXEDDIV_X86
|
||||
#define HAS_SCALEADDROW_SSE2
|
||||
#define HAS_SCALEARGBCOLS_SSE2
|
||||
#define HAS_SCALEARGBCOLSUP2_SSE2
|
||||
#define HAS_SCALEARGBFILTERCOLS_SSSE3
|
||||
@ -69,7 +70,6 @@ extern "C" {
|
||||
#define HAS_SCALEROWDOWN34_SSSE3
|
||||
#define HAS_SCALEROWDOWN38_SSSE3
|
||||
#define HAS_SCALEROWDOWN4_SSSE3
|
||||
#define HAS_SCALEADDROW_SSE2
|
||||
#endif
|
||||
|
||||
// The following are available on all x86 platforms, but
|
||||
@ -86,7 +86,9 @@ extern "C" {
|
||||
// The following are available on Neon platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_SCALEADDROW_NEON
|
||||
#define HAS_SCALEARGBCOLS_NEON
|
||||
#define HAS_SCALEARGBFILTERCOLS_NEON
|
||||
#define HAS_SCALEARGBROWDOWN2_NEON
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_NEON
|
||||
#define HAS_SCALEFILTERCOLS_NEON
|
||||
@ -94,7 +96,6 @@ extern "C" {
|
||||
#define HAS_SCALEROWDOWN34_NEON
|
||||
#define HAS_SCALEROWDOWN38_NEON
|
||||
#define HAS_SCALEROWDOWN4_NEON
|
||||
#define HAS_SCALEARGBFILTERCOLS_NEON
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
@ -113,18 +114,18 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
|
||||
#define HAS_FIXEDDIV1_MIPS
|
||||
#define HAS_FIXEDDIV_MIPS
|
||||
#define HAS_SCALEADDROW_16_MMI
|
||||
#define HAS_SCALEADDROW_MMI
|
||||
#define HAS_SCALEARGBCOLS_MMI
|
||||
#define HAS_SCALEARGBCOLSUP2_MMI
|
||||
#define HAS_SCALEARGBROWDOWN2_MMI
|
||||
#define HAS_SCALEARGBROWDOWNEVEN_MMI
|
||||
#define HAS_SCALEROWDOWN2_MMI
|
||||
#define HAS_SCALEROWDOWN4_MMI
|
||||
#define HAS_SCALEADDROW_MMI
|
||||
#define HAS_SCALEADDROW_16_MMI
|
||||
#define HAS_SCALEROWDOWN2_16_MMI
|
||||
#define HAS_SCALEROWDOWN4_16_MMI
|
||||
#define HAS_SCALECOLS_MMI
|
||||
#define HAS_SCALECOLS_16_MMI
|
||||
#define HAS_SCALECOLS_MMI
|
||||
#define HAS_SCALEROWDOWN2_16_MMI
|
||||
#define HAS_SCALEROWDOWN2_MMI
|
||||
#define HAS_SCALEROWDOWN4_16_MMI
|
||||
#define HAS_SCALEROWDOWN4_MMI
|
||||
#endif
|
||||
|
||||
// Scale ARGB vertically with bilinear interpolation.
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1721
|
||||
#define LIBYUV_VERSION 1722
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -1980,9 +1980,8 @@ int NV12ToRAW(const uint8_t* src_y,
|
||||
int dst_stride_raw,
|
||||
int width,
|
||||
int height) {
|
||||
return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
|
||||
dst_raw, dst_stride_raw, &kYvuI601Constants,
|
||||
width, height);
|
||||
return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw,
|
||||
dst_stride_raw, &kYvuI601Constants, width, height);
|
||||
}
|
||||
|
||||
// Convert NV21 to RAW.
|
||||
@ -1995,9 +1994,8 @@ int NV21ToRAW(const uint8_t* src_y,
|
||||
int dst_stride_raw,
|
||||
int width,
|
||||
int height) {
|
||||
return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
|
||||
dst_raw, dst_stride_raw, &kYvuI601Constants,
|
||||
width, height);
|
||||
return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw,
|
||||
dst_stride_raw, &kYvuI601Constants, width, height);
|
||||
}
|
||||
|
||||
// Convert M420 to ARGB.
|
||||
|
||||
@ -8,6 +8,8 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <string.h> // For memset/memcpy
|
||||
|
||||
#include "libyuv/scale.h"
|
||||
#include "libyuv/scale_row.h"
|
||||
|
||||
@ -499,6 +501,45 @@ SDAANY(ScaleARGBRowDownEvenBox_Any_MMI,
|
||||
1)
|
||||
#endif
|
||||
|
||||
#ifdef SASIMDONLY
|
||||
// This also works and uses memcpy and SIMD instead of C, but is slower on ARM
|
||||
|
||||
// Add rows box filter scale down.  Using macro from row_any.
//
// Generates NAMEANY(src_ptr, dst_ptr, width), which adds a row of bytes into a
// row of uint16_t accumulators for any width.
//   NAMEANY  - name of the generated function.
//   ANY_SIMD - row kernel, called as ANY_SIMD(src, dst, count).
//   SBPP     - bytes per source element.
//   BPP      - bytes per destination (accumulator) element.
//   MASK     - kernel step minus one (7, 15 or 31 in the users below).  The
//              temporaries hold 32 elements, so MASK + 1 must not exceed 32.
//
// The first (width & ~MASK) elements are processed in place.  The remaining
// (width & MASK) elements are copied into aligned temporaries, run through one
// full kernel step, and only the valid results are copied back.
#define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                            \
  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) {       \
    SIMD_ALIGNED(uint16_t dst_temp[32]);                                     \
    SIMD_ALIGNED(uint8_t src_temp[32]);                                      \
    memset(dst_temp, 0, 32 * 2); /* for msan */                              \
    int r = width & MASK;                                                    \
    int n = width & ~MASK;                                                   \
    if (n > 0) {                                                             \
      ANY_SIMD(src_ptr, dst_ptr, n);                                         \
    }                                                                        \
    if (r > 0) {                                                             \
      /* BPP is a byte count but dst_ptr is uint16_t*, so the offset must */ \
      /* be applied to a byte pointer; dst_ptr + n * BPP would land on    */ \
      /* element 2 * n instead of element n.                              */ \
      uint8_t* dst_tail = (uint8_t*)dst_ptr + n * BPP;                       \
      memcpy(src_temp, src_ptr + n * SBPP, r * SBPP);                        \
      memcpy(dst_temp, dst_tail, r * BPP);                                   \
      ANY_SIMD(src_temp, dst_temp, MASK + 1);                                \
      memcpy(dst_tail, dst_temp, r * BPP);                                   \
    }                                                                        \
  }
|
||||
|
||||
// Any-width ScaleAddRow wrappers, one per available SIMD kernel.
// Arguments: generated name, SIMD kernel, source bytes per element,
// accumulator bytes per element, kernel step mask.
#ifdef HAS_SCALEADDROW_SSE2
SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_AVX2
SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31)
#endif
#ifdef HAS_SCALEADDROW_NEON
SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MSA
SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15)
#endif
#ifdef HAS_SCALEADDROW_MMI
SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7)
#endif
// The macro defined in this branch is SAROW; SAANY is only defined in the
// #else branch, so undefining SAANY here would leave SAROW defined.
#undef SAROW
|
||||
|
||||
#else
|
||||
|
||||
// Add rows box filter scale down.
|
||||
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \
|
||||
void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
|
||||
@ -526,6 +567,8 @@ SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7)
|
||||
#endif
|
||||
#undef SAANY
|
||||
|
||||
#endif // SASIMDONLY
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -504,37 +504,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
|
||||
}
|
||||
|
||||
void ScaleAddRows_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width,
|
||||
int src_height) {
|
||||
const uint8_t* src_tmp;
|
||||
// Add a row of bytes to a row of shorts. Used for box filter.
|
||||
// Reads 16 bytes and accumulates to 16 shorts at a time.
|
||||
void ScaleAddRow_NEON(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
"mov r12, %5 \n"
|
||||
"veor q2, q2, q2 \n"
|
||||
"veor q3, q3, q3 \n"
|
||||
"2: \n"
|
||||
// load 16 pixels into q0
|
||||
"vld1.8 {q0}, [%0], %3 \n"
|
||||
"vaddw.u8 q3, q3, d1 \n"
|
||||
"vaddw.u8 q2, q2, d0 \n"
|
||||
"subs r12, r12, #1 \n"
|
||||
"bgt 2b \n"
|
||||
"vst1.16 {q2, q3}, [%2]! \n" // store pixels
|
||||
"add %1, %1, #16 \n"
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop
|
||||
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
|
||||
"vaddw.u8 q2, q2, d1 \n" // add
|
||||
"vaddw.u8 q1, q1, d0 \n"
|
||||
"vst1.16 {q1, q2}, [%1]! \n" // store accumulator
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
"bgt 1b \n"
|
||||
: "=&r"(src_tmp), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_ptr), // %2
|
||||
"+r"(src_stride), // %3
|
||||
"+r"(src_width), // %4
|
||||
"+r"(src_height) // %5
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List
|
||||
: "memory", "cc", "q0", "q1", "q2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@ -515,37 +515,25 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
"v19", "v30", "v31", "memory", "cc");
|
||||
}
|
||||
|
||||
void ScaleAddRows_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width,
|
||||
int src_height) {
|
||||
const uint8_t* src_tmp;
|
||||
// Add a row of bytes to a row of shorts. Used for box filter.
|
||||
// Reads 16 bytes and accumulates to 16 shorts at a time.
|
||||
void ScaleAddRow_NEON(const uint8_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int src_width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"mov %0, %1 \n"
|
||||
"mov w12, %w5 \n"
|
||||
"eor v2.16b, v2.16b, v2.16b \n"
|
||||
"eor v3.16b, v3.16b, v3.16b \n"
|
||||
"2: \n"
|
||||
// load 16 pixels into q0
|
||||
"ld1 {v0.16b}, [%0], %3 \n"
|
||||
"uaddw2 v3.8h, v3.8h, v0.16b \n"
|
||||
"uaddw v2.8h, v2.8h, v0.8b \n"
|
||||
"subs w12, w12, #1 \n"
|
||||
"b.gt 2b \n"
|
||||
"st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
|
||||
"add %1, %1, #16 \n"
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop
|
||||
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
|
||||
"uaddw2 v2.8h, v2.8h, v0.16b \n" // add
|
||||
"uaddw v1.8h, v1.8h, v0.8b \n"
|
||||
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||
"b.gt 1b \n"
|
||||
: "=&r"(src_tmp), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(dst_ptr), // %2
|
||||
"+r"(src_stride), // %3
|
||||
"+r"(src_width), // %4
|
||||
"+r"(src_height) // %5
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(src_width) // %2
|
||||
:
|
||||
: "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
|
||||
: "memory", "cc", "v0", "v1", "v2" // Clobber List
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@ -1693,7 +1693,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
@ -1727,7 +1727,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
@ -1786,7 +1786,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
@ -1816,7 +1816,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
@ -1846,7 +1846,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
@ -1876,7 +1876,7 @@ TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) {
|
||||
EXPECT_EQ(0, ret);
|
||||
|
||||
int half_width = (width + 1) / 2;
|
||||
int half_height = (height + 1)/ 2;
|
||||
int half_height = (height + 1) / 2;
|
||||
int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
|
||||
benchmark_height_ / (width * height);
|
||||
|
||||
|
||||
@ -303,10 +303,10 @@ TEST_FACTOR(3, 1, 3)
|
||||
|
||||
TEST_SCALETO(ARGBScale, 1, 1)
|
||||
TEST_SCALETO(ARGBScale, 320, 240)
|
||||
TEST_SCALETO(ARGBScale, 352, 288)
|
||||
TEST_SCALETO(ARGBScale, 569, 480)
|
||||
TEST_SCALETO(ARGBScale, 640, 360)
|
||||
TEST_SCALETO(ARGBScale, 1280, 720)
|
||||
TEST_SCALETO(ARGBScale, 1920, 1080)
|
||||
#undef TEST_SCALETO1
|
||||
#undef TEST_SCALETO
|
||||
|
||||
|
||||
@ -336,10 +336,10 @@ TEST_FACTOR(3, 1, 3, 0)
|
||||
|
||||
TEST_SCALETO(Scale, 1, 1)
|
||||
TEST_SCALETO(Scale, 320, 240)
|
||||
TEST_SCALETO(Scale, 352, 288)
|
||||
TEST_SCALETO(Scale, 569, 480)
|
||||
TEST_SCALETO(Scale, 640, 360)
|
||||
TEST_SCALETO(Scale, 1280, 720)
|
||||
TEST_SCALETO(Scale, 1920, 1080)
|
||||
#undef TEST_SCALETO1
|
||||
#undef TEST_SCALETO
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user