mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
UVScale down by 2 fix for C and optimize for NEON
- update cpu_id to use "re" for fopen to avoid leaking handles if a thread is started while the file is open.
Bug: libyuv:958
Change-Id: I1af9de68fce12e440e1226fc8070634ccb1bf090
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4417176
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
ee3e71c7ce
commit
68659d0d68
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 1865
|
||||
Version: 1866
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -133,6 +133,8 @@ extern "C" {
|
||||
#define HAS_SCALEROWDOWN34_NEON
|
||||
#define HAS_SCALEROWDOWN38_NEON
|
||||
#define HAS_SCALEROWDOWN4_NEON
|
||||
#define HAS_SCALEUVROWDOWN2_NEON
|
||||
#define HAS_SCALEUVROWDOWN2LINEAR_NEON
|
||||
#define HAS_SCALEUVROWDOWN2BOX_NEON
|
||||
#define HAS_SCALEUVROWDOWNEVEN_NEON
|
||||
#define HAS_SCALEROWUP2_LINEAR_NEON
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1865
|
||||
#define LIBYUV_VERSION 1866
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -137,7 +137,7 @@ static int GetXCR0() {
|
||||
// For Arm, but public to allow testing on any CPU
|
||||
LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
FILE* f = fopen(cpuinfo_name, "r");
|
||||
FILE* f = fopen(cpuinfo_name, "re");
|
||||
if (!f) {
|
||||
// Assume Neon if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
@ -166,7 +166,7 @@ LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
|
||||
LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
int flag = 0x0;
|
||||
FILE* f = fopen(cpuinfo_name, "r");
|
||||
FILE* f = fopen(cpuinfo_name, "re");
|
||||
if (!f) {
|
||||
// Assume nothing if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
@ -194,7 +194,7 @@ LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) {
|
||||
LIBYUV_API SAFEBUFFERS int RiscvCpuCaps(const char* cpuinfo_name) {
|
||||
char cpuinfo_line[512];
|
||||
int flag = 0x0;
|
||||
FILE* f = fopen(cpuinfo_name, "r");
|
||||
FILE* f = fopen(cpuinfo_name, "re");
|
||||
if (!f) {
|
||||
// Assume nothing if /proc/cpuinfo is unavailable.
|
||||
// This will occur for Chrome sandbox for Pepper or Render process.
|
||||
|
||||
@ -128,6 +128,22 @@ SDODD(ScaleRowDown2Box_Odd_NEON,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2_NEON
|
||||
SDANY(ScaleUVRowDown2_Any_NEON,
|
||||
ScaleUVRowDown2_NEON,
|
||||
ScaleUVRowDown2_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2LINEAR_NEON
|
||||
SDANY(ScaleUVRowDown2Linear_Any_NEON,
|
||||
ScaleUVRowDown2Linear_NEON,
|
||||
ScaleUVRowDown2Linear_C,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
|
||||
SDANY(ScaleUVRowDown2Box_Any_NEON,
|
||||
ScaleUVRowDown2Box_NEON,
|
||||
|
||||
@ -1280,18 +1280,13 @@ void ScaleUVRowDown2_C(const uint8_t* src_uv,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_uv,
|
||||
int dst_width) {
|
||||
const uint16_t* src = (const uint16_t*)(src_uv);
|
||||
uint16_t* dst = (uint16_t*)(dst_uv);
|
||||
int x;
|
||||
(void)src_stride;
|
||||
for (x = 0; x < dst_width - 1; x += 2) {
|
||||
dst[0] = src[1];
|
||||
dst[1] = src[3];
|
||||
src += 2;
|
||||
dst += 2;
|
||||
}
|
||||
if (dst_width & 1) {
|
||||
dst[0] = src[1];
|
||||
for (x = 0; x < dst_width; ++x) {
|
||||
dst_uv[0] = src_uv[2]; // Store the 2nd UV
|
||||
dst_uv[1] = src_uv[3];
|
||||
src_uv += 4;
|
||||
dst_uv += 2;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1428,6 +1428,45 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
|
||||
|
||||
#undef LOAD2_DATA32_LANE
|
||||
|
||||
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
|
||||
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vst1.16 {q1}, [%1]! \n" // store 8 UV
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "q0", "q1");
|
||||
}
|
||||
|
||||
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
|
||||
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
|
||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
||||
"vrhadd.u8 q0, q0, q1 \n" // rounding half add
|
||||
"vst1.16 {q0}, [%1]! \n" // store 8 UV
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "q0", "q1");
|
||||
}
|
||||
|
||||
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
|
||||
@ -1568,6 +1568,45 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
|
||||
);
|
||||
}
|
||||
|
||||
void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st1 {v1.8h}, [%1], #16 \n" // store 8 UV
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "v0", "v1");
|
||||
}
|
||||
|
||||
void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_width) {
|
||||
(void)src_stride;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
|
||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
||||
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"st1 {v0.8h}, [%1], #16 \n" // store 8 UV
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "v0", "v1");
|
||||
}
|
||||
|
||||
void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst,
|
||||
|
||||
@ -112,6 +112,22 @@ static void ScaleUVDown2(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_SCALEUVROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleUVRowDown2 =
|
||||
filtering == kFilterNone
|
||||
? ScaleUVRowDown2_Any_NEON
|
||||
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
|
||||
: ScaleUVRowDown2Box_Any_NEON);
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleUVRowDown2 =
|
||||
filtering == kFilterNone
|
||||
? ScaleUVRowDown2_NEON
|
||||
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
|
||||
: ScaleUVRowDown2Box_NEON);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// This code is not enabled. Only box filter is available at this time.
|
||||
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
|
||||
@ -130,23 +146,7 @@ static void ScaleUVDown2(int src_width,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// This code is not enabled. Only box filter is available at this time.
|
||||
#if defined(HAS_SCALEUVROWDOWN2_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ScaleUVRowDown2 =
|
||||
filtering == kFilterNone
|
||||
? ScaleUVRowDown2_Any_NEON
|
||||
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
|
||||
: ScaleUVRowDown2Box_Any_NEON);
|
||||
if (IS_ALIGNED(dst_width, 8)) {
|
||||
ScaleUVRowDown2 =
|
||||
filtering == kFilterNone
|
||||
? ScaleUVRowDown2_NEON
|
||||
: (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
|
||||
: ScaleUVRowDown2Box_NEON);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_SCALEUVROWDOWN2_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
ScaleUVRowDown2 =
|
||||
|
||||
@ -39,55 +39,35 @@ static int UVTestFilter(int src_width,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int i, j;
|
||||
const int b = 0; // 128 to test for padding/stride.
|
||||
int64_t src_uv_plane_size =
|
||||
(Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL;
|
||||
int src_stride_uv = (b * 2 + Abs(src_width)) * 2;
|
||||
int i;
|
||||
int64_t src_uv_plane_size = Abs(src_width) * Abs(src_height) * 2LL;
|
||||
int src_stride_uv = Abs(src_width) * 2;
|
||||
int64_t dst_uv_plane_size = dst_width * dst_height * 2LL;
|
||||
int dst_stride_uv = dst_width * 2;
|
||||
|
||||
align_buffer_page_end(src_uv, src_uv_plane_size);
|
||||
if (!src_uv) {
|
||||
align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
|
||||
align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
|
||||
|
||||
if (!src_uv || !dst_uv_c || !dst_uv_opt) {
|
||||
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
|
||||
return 0;
|
||||
}
|
||||
MemRandomize(src_uv, src_uv_plane_size);
|
||||
|
||||
int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL;
|
||||
int dst_stride_uv = (b * 2 + dst_width) * 2;
|
||||
|
||||
align_buffer_page_end(dst_uv_c, dst_uv_plane_size);
|
||||
align_buffer_page_end(dst_uv_opt, dst_uv_plane_size);
|
||||
if (!dst_uv_c || !dst_uv_opt) {
|
||||
printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n");
|
||||
return 0;
|
||||
}
|
||||
memset(dst_uv_c, 2, dst_uv_plane_size);
|
||||
memset(dst_uv_opt, 3, dst_uv_plane_size);
|
||||
|
||||
// Warm up both versions for consistent benchmarks.
|
||||
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
|
||||
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
|
||||
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
|
||||
dst_width, dst_height, f);
|
||||
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
|
||||
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
|
||||
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
|
||||
dst_width, dst_height, f);
|
||||
memset(dst_uv_opt, 123, dst_uv_plane_size);
|
||||
|
||||
MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
|
||||
double c_time = get_time();
|
||||
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
|
||||
src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv,
|
||||
dst_width, dst_height, f);
|
||||
|
||||
UVScale(src_uv, src_stride_uv, src_width, src_height,
|
||||
dst_uv_c, dst_stride_uv, dst_width, dst_height, f);
|
||||
c_time = (get_time() - c_time);
|
||||
|
||||
MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization.
|
||||
double opt_time = get_time();
|
||||
for (i = 0; i < benchmark_iterations; ++i) {
|
||||
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width,
|
||||
src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv,
|
||||
dst_width, dst_height, f);
|
||||
UVScale(src_uv, src_stride_uv, src_width, src_height,
|
||||
dst_uv_opt, dst_stride_uv, dst_width, dst_height, f);
|
||||
}
|
||||
opt_time = (get_time() - opt_time) / benchmark_iterations;
|
||||
|
||||
@ -95,20 +75,13 @@ static int UVTestFilter(int src_width,
|
||||
printf("filter %d - %8d us C - %8d us OPT\n", f,
|
||||
static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
|
||||
|
||||
// C version may be a little off from the optimized. Order of
|
||||
// operations may introduce rounding somewhere. So do a difference
|
||||
// of the buffers and look to see that the max difference isn't
|
||||
// over 2.
|
||||
int max_diff = 0;
|
||||
for (i = b; i < (dst_height + b); ++i) {
|
||||
for (j = b * 2; j < (dst_width + b) * 2; ++j) {
|
||||
int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] -
|
||||
dst_uv_opt[(i * dst_stride_uv) + j]);
|
||||
for (i = 0; i < dst_uv_plane_size; ++i) {
|
||||
int abs_diff = Abs(dst_uv_c[i] - dst_uv_opt[i]);
|
||||
if (abs_diff > max_diff) {
|
||||
max_diff = abs_diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(dst_uv_c);
|
||||
free_aligned_buffer_page_end(dst_uv_opt);
|
||||
@ -121,28 +94,27 @@ static int UVTestFilter(int src_width,
|
||||
#define DX(x, nom, denom) static_cast<int>((Abs(x) / nom) * nom)
|
||||
#define SX(x, nom, denom) static_cast<int>((x / nom) * denom)
|
||||
|
||||
#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \
|
||||
#define TEST_FACTOR1(name, filter, nom, denom) \
|
||||
TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \
|
||||
int diff = UVTestFilter( \
|
||||
SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \
|
||||
DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \
|
||||
kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \
|
||||
benchmark_cpu_info_); \
|
||||
EXPECT_LE(diff, max_diff); \
|
||||
EXPECT_EQ(0, diff); \
|
||||
}
|
||||
|
||||
#if defined(ENABLE_FULL_TESTS)
|
||||
// Test a scale factor with all 4 filters. Expect unfiltered to be exact, but
|
||||
// filtering is different fixed point implementations for SSSE3, Neon and C.
|
||||
// Test a scale factor with all 4 filters. Expect exact for SIMD vs C.
|
||||
#define TEST_FACTOR(name, nom, denom) \
|
||||
TEST_FACTOR1(name, None, nom, denom, 0) \
|
||||
TEST_FACTOR1(name, Linear, nom, denom, 3) \
|
||||
TEST_FACTOR1(name, Bilinear, nom, denom, 3) \
|
||||
TEST_FACTOR1(name, Box, nom, denom, 3)
|
||||
TEST_FACTOR1(name, None, nom, denom) \
|
||||
TEST_FACTOR1(name, Linear, nom, denom) \
|
||||
TEST_FACTOR1(name, Bilinear, nom, denom) \
|
||||
TEST_FACTOR1(name, Box, nom, denom)
|
||||
#else
|
||||
// Test a scale factor with Bilinear.
|
||||
#define TEST_FACTOR(name, nom, denom) \
|
||||
TEST_FACTOR1(name, Bilinear, nom, denom, 3)
|
||||
TEST_FACTOR1(name, Bilinear, nom, denom)
|
||||
#endif
|
||||
|
||||
TEST_FACTOR(2, 1, 2)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user