mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-09 03:06:44 +08:00
Change ScaleSumSamples to return Sum of Squares
TBR=kjellander@chromium.org
BUG=libyuv:717
TEST=LibYUVPlanarTest.TestScaleSumSamples_Opt

Change-Id: I5208666f3968c5c4b0f1b0c951f24216d78ee3fe
Reviewed-on: https://chromium-review.googlesource.com/607184
Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
8676ad7004
commit
83ca1abe09
7
BUILD.gn
7
BUILD.gn
@ -158,10 +158,11 @@ static_library("libyuv_internal") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# To enable AVX2 or other cpu optimization, pass flag here
|
# To enable AVX2 or other cpu optimization, pass flag here
|
||||||
# cflags = [ "-mavx2" ]
|
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
|
||||||
# cflags = [ "-mpopcnt" ]
|
if (!is_win) {
|
||||||
|
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (libyuv_use_neon) {
|
if (libyuv_use_neon) {
|
||||||
static_library("libyuv_neon") {
|
static_library("libyuv_neon") {
|
||||||
sources = [
|
sources = [
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1664
|
Version: 1665
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1664
|
#define LIBYUV_VERSION 1665
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -2642,10 +2642,13 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y,
|
|||||||
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
|
float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
|
||||||
float fmax = 0.f;
|
float fmax = 0.f;
|
||||||
int i;
|
int i;
|
||||||
|
#if defined(__clang__)
|
||||||
|
#pragma clang loop vectorize_width(4)
|
||||||
|
#endif
|
||||||
for (i = 0; i < width; ++i) {
|
for (i = 0; i < width; ++i) {
|
||||||
float v = *src++ * scale;
|
float v = *src++;
|
||||||
*dst++ = v;
|
fmax += v * v;
|
||||||
fmax = (v > fmax) ? v : fmax;
|
*dst++ = v * scale;
|
||||||
}
|
}
|
||||||
return fmax;
|
return fmax;
|
||||||
}
|
}
|
||||||
@ -2653,8 +2656,7 @@ float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
|
|||||||
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
|
void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < width; ++i) {
|
for (i = 0; i < width; ++i) {
|
||||||
float v = *src++ * scale;
|
*dst++ = *src++ * scale;
|
||||||
*dst++ = v;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -2616,30 +2616,33 @@ float ScaleSumSamples_NEON(const float* src,
|
|||||||
float* dst,
|
float* dst,
|
||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
float fmax;
|
float fsum;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v3.4s, #0 \n" // max
|
"movi v5.4s, #0 \n" // sum of squares accumulator
|
||||||
"movi v4.4s, #0 \n" // max
|
"movi v6.4s, #0 \n" // sum of squares accumulator
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
"ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
"subs %w2, %w2, #8 \n" // 8 processed per loop
|
||||||
"fmul v1.4s, v1.4s, %4.s[0] \n" // scale
|
"fmul v3.4s, v1.4s, %4.s[0] \n" // scale
|
||||||
"fmul v2.4s, v2.4s, %4.s[0] \n" // scale
|
"fmul v4.4s, v2.4s, %4.s[0] \n"
|
||||||
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
|
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
|
||||||
"fmax v3.4s, v3.4s, v1.4s \n" // max
|
"fmla v6.4s, v2.4s, v2.4s \n"
|
||||||
"fmax v4.4s, v4.4s, v2.4s \n"
|
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
|
||||||
|
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
"fmax v3.4s, v3.4s, v4.4s \n" // max
|
"faddp v5.4s, v5.4s, v6.4s \n"
|
||||||
"fmaxv %s3, v3.4s \n" // signed max accumulator
|
"faddp v5.4s, v5.4s, v5.4s \n"
|
||||||
|
"faddp v5.4s, v5.4s, v5.4s \n"
|
||||||
|
"fmov %w3, s5 \n" // sum
|
||||||
|
|
||||||
: "+r"(src), // %0
|
: "+r"(src), // %0
|
||||||
"+r"(dst), // %1
|
"+r"(dst), // %1
|
||||||
"+r"(width), // %2
|
"+r"(width), // %2
|
||||||
"=w"(fmax) // %3
|
"=w"(fsum) // %3
|
||||||
: "w"(scale) // %4
|
: "w"(scale) // %4
|
||||||
: "cc", "memory", "v1", "v2", "v3", "v4");
|
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||||
return fmax;
|
return fsum;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
|
||||||
|
|||||||
@ -2527,7 +2527,7 @@ float TestScaleSumSamples(int benchmark_width,
|
|||||||
float scale,
|
float scale,
|
||||||
bool opt) {
|
bool opt) {
|
||||||
int i, j;
|
int i, j;
|
||||||
float max_c, max_opt;
|
float sum_c, sum_opt = 0.f;
|
||||||
const int y_plane_size = benchmark_width * benchmark_height * 4;
|
const int y_plane_size = benchmark_width * benchmark_height * 4;
|
||||||
|
|
||||||
align_buffer_page_end(orig_y, y_plane_size * 3);
|
align_buffer_page_end(orig_y, y_plane_size * 3);
|
||||||
@ -2542,32 +2542,29 @@ float TestScaleSumSamples(int benchmark_width,
|
|||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_c, 0, y_plane_size);
|
||||||
memset(dst_opt, 1, y_plane_size);
|
memset(dst_opt, 1, y_plane_size);
|
||||||
|
|
||||||
// Disable all optimizations.
|
sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
max_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
|
||||||
reinterpret_cast<float*>(dst_c), scale,
|
reinterpret_cast<float*>(dst_c), scale,
|
||||||
benchmark_width * benchmark_height);
|
benchmark_width * benchmark_height);
|
||||||
|
|
||||||
// Enable optimizations.
|
|
||||||
for (j = 0; j < benchmark_iterations; j++) {
|
for (j = 0; j < benchmark_iterations; j++) {
|
||||||
#ifdef HAS_SCALESUMSAMPLES_NEON
|
|
||||||
if (opt) {
|
if (opt) {
|
||||||
max_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
|
#ifdef HAS_SCALESUMSAMPLES_NEON
|
||||||
|
sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
benchmark_width * benchmark_height);
|
benchmark_width * benchmark_height);
|
||||||
|
#else
|
||||||
|
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
|
benchmark_width * benchmark_height);
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
benchmark_width * benchmark_height);
|
benchmark_width * benchmark_height);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
max_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
float max_diff = 0;
|
float max_diff = FAbs(sum_opt - sum_c);
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
for (i = 0; i < y_plane_size / 4; ++i) {
|
||||||
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
||||||
(reinterpret_cast<float*>(dst_opt)[i]));
|
(reinterpret_cast<float*>(dst_opt)[i]));
|
||||||
@ -2613,32 +2610,29 @@ float TestScaleSamples(int benchmark_width,
|
|||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_c, 0, y_plane_size);
|
||||||
memset(dst_opt, 1, y_plane_size);
|
memset(dst_opt, 1, y_plane_size);
|
||||||
|
|
||||||
// Disable all optimizations.
|
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_c), scale,
|
reinterpret_cast<float*>(dst_c), scale,
|
||||||
benchmark_width * benchmark_height);
|
benchmark_width * benchmark_height);
|
||||||
|
|
||||||
// Enable optimizations.
|
|
||||||
for (j = 0; j < benchmark_iterations; j++) {
|
for (j = 0; j < benchmark_iterations; j++) {
|
||||||
#ifdef HAS_SCALESAMPLES_NEON
|
|
||||||
if (opt) {
|
if (opt) {
|
||||||
max_opt = ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
|
#ifdef HAS_SCALESUMSAMPLES_NEON
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
|
||||||
benchmark_width * benchmark_height);
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
|
benchmark_width * benchmark_height);
|
||||||
|
#else
|
||||||
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
|
benchmark_width * benchmark_height);
|
||||||
|
#endif
|
||||||
} else {
|
} else {
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
benchmark_width * benchmark_height);
|
benchmark_width * benchmark_height);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
float max_diff = 0;
|
float max_diff =0.f;
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
for (i = 0; i < y_plane_size / 4; ++i) {
|
||||||
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
||||||
(reinterpret_cast<float*>(dst_opt)[i]));
|
(reinterpret_cast<float*>(dst_opt)[i]));
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user