mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-09 03:06:44 +08:00
Change TestScaleSumSamples_C test to allow for some float error in sum.
The sum of floats can optimize differently with vectorization, producing a different result between NEON and C. Adjust the unittest to allow for some difference in the sum. The NEON version is 8 samples at a time, so the test now rounds up the number of values to a multiple of 8. TBR=kjellander@chromium.org Bug: libyuv:717 Test: LibYUVPlanarTest.TestScaleSumSamples_Opt Change-Id: I2a0783780c7e0f240f7a8e4700b2a4d3e6b52d87 Reviewed-on: https://chromium-review.googlesource.com/673708 Reviewed-by: Cheng Wang <wangcheng@google.com>
This commit is contained in:
parent
efbf15754a
commit
0b0a891cb2
3
BUILD.gn
3
BUILD.gn
@ -160,7 +160,7 @@ static_library("libyuv_internal") {
|
|||||||
# To enable AVX2 or other cpu optimization, pass flag here
|
# To enable AVX2 or other cpu optimization, pass flag here
|
||||||
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
|
# cflags = [ "-mavx2", "-mpopcnt", "-mavx2", "-mfma" ]
|
||||||
if (!is_win) {
|
if (!is_win) {
|
||||||
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
|
cflags = [ "-ffp-contract=fast" ] # Enable fma vectorization for NEON.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (libyuv_use_neon) {
|
if (libyuv_use_neon) {
|
||||||
@ -185,6 +185,7 @@ if (libyuv_use_neon) {
|
|||||||
configs -= [ "//build/config/compiler:default_optimization" ]
|
configs -= [ "//build/config/compiler:default_optimization" ]
|
||||||
|
|
||||||
# Enable optimize for speed (-O2) over size (-Os).
|
# Enable optimize for speed (-O2) over size (-Os).
|
||||||
|
# TODO(fbarchard): Consider optimize_speed which is O3.
|
||||||
configs += [ "//build/config/compiler:optimize_max" ]
|
configs += [ "//build/config/compiler:optimize_max" ]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -8,6 +8,7 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <math.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
@ -2623,44 +2624,44 @@ float TestScaleMaxSamples(int benchmark_width,
|
|||||||
bool opt) {
|
bool opt) {
|
||||||
int i, j;
|
int i, j;
|
||||||
float max_c, max_opt = 0.f;
|
float max_c, max_opt = 0.f;
|
||||||
const int y_plane_size = benchmark_width * benchmark_height * 4;
|
// NEON does multiple of 8, so round count up
|
||||||
|
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
|
||||||
align_buffer_page_end(orig_y, y_plane_size * 3);
|
align_buffer_page_end(orig_y, kPixels * 4 * 3);
|
||||||
uint8* dst_opt = orig_y + y_plane_size;
|
uint8* dst_c = orig_y + kPixels * 4;
|
||||||
uint8* dst_c = orig_y + y_plane_size * 2;
|
uint8* dst_opt = orig_y + kPixels * 4 * 2;
|
||||||
|
|
||||||
// Randomize works but may contain some denormals affecting performance.
|
// Randomize works but may contain some denormals affecting performance.
|
||||||
// MemRandomize(orig_y, y_plane_size);
|
// MemRandomize(orig_y, kPixels * 4);
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
// large values are problematic. audio is really -1 to 1.
|
||||||
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
|
for (i = 0; i < kPixels; ++i) {
|
||||||
|
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
|
||||||
}
|
}
|
||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_c, 0, kPixels * 4);
|
||||||
memset(dst_opt, 1, y_plane_size);
|
memset(dst_opt, 1, kPixels * 4);
|
||||||
|
|
||||||
max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
max_c = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_c), scale,
|
reinterpret_cast<float*>(dst_c), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
|
|
||||||
for (j = 0; j < benchmark_iterations; j++) {
|
for (j = 0; j < benchmark_iterations; j++) {
|
||||||
if (opt) {
|
if (opt) {
|
||||||
#ifdef HAS_SCALESUMSAMPLES_NEON
|
#ifdef HAS_SCALESUMSAMPLES_NEON
|
||||||
max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y),
|
max_opt = ScaleMaxSamples_NEON(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
benchmark_width * benchmark_height);
|
kPixels);
|
||||||
#else
|
#else
|
||||||
max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
max_opt =
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
benchmark_width * benchmark_height);
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
max_opt = ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
max_opt =
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
ScaleMaxSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
benchmark_width * benchmark_height);
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float max_diff = FAbs(max_opt - max_c);
|
float max_diff = FAbs(max_opt - max_c);
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
for (i = 0; i < kPixels; ++i) {
|
||||||
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
||||||
(reinterpret_cast<float*>(dst_opt)[i]));
|
(reinterpret_cast<float*>(dst_opt)[i]));
|
||||||
if (abs_diff > max_diff) {
|
if (abs_diff > max_diff) {
|
||||||
@ -2691,44 +2692,55 @@ float TestScaleSumSamples(int benchmark_width,
|
|||||||
bool opt) {
|
bool opt) {
|
||||||
int i, j;
|
int i, j;
|
||||||
float sum_c, sum_opt = 0.f;
|
float sum_c, sum_opt = 0.f;
|
||||||
const int y_plane_size = benchmark_width * benchmark_height * 4;
|
// NEON does multiple of 8, so round count up
|
||||||
|
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
|
||||||
align_buffer_page_end(orig_y, y_plane_size * 3);
|
align_buffer_page_end(orig_y, kPixels * 4 * 3);
|
||||||
uint8* dst_opt = orig_y + y_plane_size;
|
uint8* dst_c = orig_y + kPixels * 4;
|
||||||
uint8* dst_c = orig_y + y_plane_size * 2;
|
uint8* dst_opt = orig_y + kPixels * 4 * 2;
|
||||||
|
|
||||||
// Randomize works but may contain some denormals affecting performance.
|
// Randomize works but may contain some denormals affecting performance.
|
||||||
// MemRandomize(orig_y, y_plane_size);
|
// MemRandomize(orig_y, kPixels * 4);
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
// large values are problematic. audio is really -1 to 1.
|
||||||
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
|
for (i = 0; i < kPixels; ++i) {
|
||||||
|
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
|
||||||
}
|
}
|
||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_c, 0, kPixels * 4);
|
||||||
memset(dst_opt, 1, y_plane_size);
|
memset(dst_opt, 1, kPixels * 4);
|
||||||
|
|
||||||
sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
sum_c = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_c), scale,
|
reinterpret_cast<float*>(dst_c), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
|
|
||||||
for (j = 0; j < benchmark_iterations; j++) {
|
for (j = 0; j < benchmark_iterations; j++) {
|
||||||
if (opt) {
|
if (opt) {
|
||||||
#ifdef HAS_SCALESUMSAMPLES_NEON
|
#ifdef HAS_SCALESUMSAMPLES_NEON
|
||||||
sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
|
sum_opt = ScaleSumSamples_NEON(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale,
|
||||||
benchmark_width * benchmark_height);
|
kPixels);
|
||||||
#else
|
#else
|
||||||
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
sum_opt =
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
benchmark_width * benchmark_height);
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
sum_opt = ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
sum_opt =
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
ScaleSumSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
benchmark_width * benchmark_height);
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float max_diff = FAbs(sum_opt - sum_c);
|
float mse_opt = sum_opt / kPixels * 4;
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
float mse_c = sum_c / kPixels * 4;
|
||||||
|
float mse_error = FAbs(mse_opt - mse_c) / mse_c;
|
||||||
|
|
||||||
|
// If the sum of floats is more than 4 million, small adds are rounded down on
|
||||||
|
// float and produce different results with vectorized sum vs scalar sum.
|
||||||
|
// Ignore the difference if the sum is large.
|
||||||
|
float max_diff = 0.f;
|
||||||
|
if (mse_error > 0.0001 && sum_c < 4000000) { // allow .01% difference of mse
|
||||||
|
max_diff = mse_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < kPixels; ++i) {
|
||||||
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
||||||
(reinterpret_cast<float*>(dst_opt)[i]));
|
(reinterpret_cast<float*>(dst_opt)[i]));
|
||||||
if (abs_diff > max_diff) {
|
if (abs_diff > max_diff) {
|
||||||
@ -2758,45 +2770,41 @@ float TestScaleSamples(int benchmark_width,
|
|||||||
float scale,
|
float scale,
|
||||||
bool opt) {
|
bool opt) {
|
||||||
int i, j;
|
int i, j;
|
||||||
const int y_plane_size = benchmark_width * benchmark_height * 4;
|
// NEON does multiple of 8, so round count up
|
||||||
|
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
|
||||||
align_buffer_page_end(orig_y, y_plane_size * 3);
|
align_buffer_page_end(orig_y, kPixels * 4 * 3);
|
||||||
uint8* dst_opt = orig_y + y_plane_size;
|
uint8* dst_c = orig_y + kPixels * 4;
|
||||||
uint8* dst_c = orig_y + y_plane_size * 2;
|
uint8* dst_opt = orig_y + kPixels * 4 * 2;
|
||||||
|
|
||||||
// Randomize works but may contain some denormals affecting performance.
|
// Randomize works but may contain some denormals affecting performance.
|
||||||
// MemRandomize(orig_y, y_plane_size);
|
// MemRandomize(orig_y, kPixels * 4);
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
// large values are problematic. audio is really -1 to 1.
|
||||||
(reinterpret_cast<float*>(orig_y))[i] = (i - y_plane_size / 8) * 3.1415f;
|
for (i = 0; i < kPixels; ++i) {
|
||||||
|
(reinterpret_cast<float*>(orig_y))[i] = sinf(static_cast<float>(i) * 0.1f);
|
||||||
}
|
}
|
||||||
|
memset(dst_c, 0, kPixels * 4);
|
||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_opt, 1, kPixels * 4);
|
||||||
memset(dst_opt, 1, y_plane_size);
|
|
||||||
|
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_c), scale,
|
reinterpret_cast<float*>(dst_c), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
|
|
||||||
for (j = 0; j < benchmark_iterations; j++) {
|
for (j = 0; j < benchmark_iterations; j++) {
|
||||||
if (opt) {
|
if (opt) {
|
||||||
#ifdef HAS_SCALESUMSAMPLES_NEON
|
#ifdef HAS_SCALESUMSAMPLES_NEON
|
||||||
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_NEON(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
#else
|
#else
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
ScaleSamples_C(reinterpret_cast<float*>(orig_y),
|
||||||
reinterpret_cast<float*>(dst_opt), scale,
|
reinterpret_cast<float*>(dst_opt), scale, kPixels);
|
||||||
benchmark_width * benchmark_height);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float max_diff = 0.f;
|
float max_diff = 0.f;
|
||||||
for (i = 0; i < y_plane_size / 4; ++i) {
|
for (i = 0; i < kPixels; ++i) {
|
||||||
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
float abs_diff = FAbs((reinterpret_cast<float*>(dst_c)[i]) -
|
||||||
(reinterpret_cast<float*>(dst_opt)[i]));
|
(reinterpret_cast<float*>(dst_opt)[i]));
|
||||||
if (abs_diff > max_diff) {
|
if (abs_diff > max_diff) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user