mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-07 02:09:50 +08:00

Floating point Gaussian kernels

On SkylakeX for 720p: TestGaussPlane_F32 (657 ms)
On Pixel 3: TestGaussPlane_F32 (1787 ms)

Bug: libyuv:852, b/145611468
Change-Id: I9859af1b9381621067992305727da285f82bdded
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1949667
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Marat Dukhan <maratek@google.com>

parent d82f4baf5f
commit 6e6f81b803
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1741
+Version: 1742
 License: BSD
 License File: LICENSE
@@ -743,6 +743,19 @@ int ARGBBlur(const uint8_t* src_argb,
              int height,
              int radius);

+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
+
 // Multiply ARGB image by ARGB value.
 LIBYUV_API
 int ARGBShade(const uint8_t* src_argb,
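The 1, 4, 6, 4, 1 taps are the 5-tap binomial approximation of a Gaussian. Filtering columns and then rows with these taps is equivalent to convolving with the 5x5 outer-product kernel, whose weights sum to 16 * 16 = 256, which is why a single 1/256 scale appears in the row pass. A minimal scalar reference for one output pixel, not part of this commit (blur_5x5_at and kTaps are illustrative names):

    static const int kTaps[5] = {1, 4, 6, 4, 1};  // binomial, sums to 16

    // Clamped 2D reference: the outer-product weights sum to 16 * 16 = 256,
    // so one 1.0f / 256.0f scale normalizes both passes at once.
    static float blur_5x5_at(const float* src, int stride, int width,
                             int height, int x, int y) {
      float sum = 0.0f;
      for (int ky = -2; ky <= 2; ++ky) {
        for (int kx = -2; kx <= 2; ++kx) {
          int sx = x + kx, sy = y + ky;
          if (sx < 0) sx = 0;
          if (sx > width - 1) sx = width - 1;    // edges are clamped,
          if (sy < 0) sy = 0;                    // matching the API comment
          if (sy > height - 1) sy = height - 1;
          sum += kTaps[ky + 2] * kTaps[kx + 2] * src[sy * stride + sx];
        }
      }
      return sum * (1.0f / 256.0f);
    }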
@@ -419,6 +419,9 @@ extern "C" {
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_ABGRTOUVROW_MSA
@@ -601,6 +604,7 @@ extern "C" {
 #endif
 typedef __declspec(align(16)) int16_t vec16[8];
 typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
 typedef __declspec(align(16)) int8_t vec8[16];
 typedef __declspec(align(16)) uint16_t uvec16[8];
 typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -620,6 +624,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
 #endif
 typedef int16_t __attribute__((vector_size(16))) vec16;
 typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
 typedef int8_t __attribute__((vector_size(16))) vec8;
 typedef uint16_t __attribute__((vector_size(16))) uvec16;
 typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -634,6 +639,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
 #define SIMD_ALIGNED(var) var
 typedef int16_t vec16[8];
 typedef int32_t vec32[4];
+typedef float vecf32[4];
 typedef int8_t vec8[16];
 typedef uint16_t uvec16[8];
 typedef uint32_t uvec32[4];
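The new vecf32 type rounds out the existing vec16/vec32 families so a 16-byte-aligned 4 x float constant can be declared identically under all three toolchain paths: MSVC (__declspec), GCC/Clang (vector_size), and the plain-array fallback. The NEON kernels later in this diff rely on it, for example:

    // From row_neon64.cc below: 4.0, 6.0 and 1/256 packed for ld2r/ld3r.
    static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};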
@@ -4256,6 +4262,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);

+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width);
+
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1741
+#define LIBYUV_VERSION 1742

 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -3043,6 +3043,84 @@ int ARGBShuffle(const uint8_t* src_bgra,
   return 0;
 }

+// Gauss blur a float plane using a Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and the interior is a multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
+  int y;
+  void (*GaussCol_F32)(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width) = GaussCol_F32_C;
+  void (*GaussRow_F32)(const float* src, float* dst, int width) =
+      GaussRow_F32_C;
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussCol_F32 = GaussCol_F32_NEON;
+  }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussRow_F32 = GaussRow_F32_NEON;
+  }
+#endif
+  {
+    // 2 pixels on each side, but aligned out to 16 bytes.
+    align_buffer_64(rowbuf, (4 + width + 4) * 4);
+    memset(rowbuf, 0, 16);
+    memset(rowbuf + (4 + width) * 4, 0, 16);
+    float* row = (float*)(rowbuf + 16);
+    const float* src0 = src;
+    const float* src1 = src;
+    const float* src2 = src;
+    const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+    const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+    for (y = 0; y < height; ++y) {
+      GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+      // Extrude edge by 2 floats.
+      row[-2] = row[-1] = row[0];
+      row[width + 1] = row[width] = row[width - 1];
+
+      GaussRow_F32(row - 2, dst, width);
+
+      src0 = src1;
+      src1 = src2;
+      src2 = src3;
+      src3 = src4;
+      if ((y + 2) < (height - 1)) {
+        src4 += src_stride;
+      }
+      dst += dst_stride;
+    }
+    free_aligned_buffer_64(rowbuf);
+  }
+  return 0;
+}
+
 // Sobel ARGB effect.
 static int ARGBSobelize(const uint8_t* src_argb,
                         int src_stride_argb,
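GaussPlane_F32 slides a five-row window (src0..src4, clamped at the top and bottom) down the image: the column pass writes an unnormalized vertical sum into a row buffer padded by two floats on each side, the edges are extruded, and the row pass filters horizontally and applies the single 1/256 scale. Note that the strides are in float elements, not bytes, as the pointer arithmetic above shows. A minimal caller sketch, not part of the commit (blur_720p is an illustrative name):

    void blur_720p(const float* in, float* out) {
      enum { kWidth = 1280, kHeight = 720 };
      // Width is a multiple of 8, so the NEON row/column kernels can be used.
      GaussPlane_F32(in, kWidth, out, kWidth, kWidth, kHeight);
    }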
@@ -3358,6 +3358,29 @@ void GaussCol_C(const uint16_t* src0,
   }
 }

+void GaussRow_F32_C(const float* src, float* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) *
+             (1.0f / 256.0f);
+    ++src;
+  }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+  }
+}
+
 // Convert biplanar NV21 to packed YUV24
 void NV21ToYUV24Row_C(const uint8_t* src_y,
                       const uint8_t* src_vu,
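The split normalization is easy to sanity-check: the column pass scales a constant signal by 1 + 4 + 6 + 4 + 1 = 16, and the row pass multiplies by 16/256 = 1/16, so a flat input passes through unchanged. A small check, not part of the commit (check_dc_preserved is an illustrative name):

    // For rows all equal to 2.0f: column pass yields 32.0f per element,
    // row pass yields 32 * 16 / 256 = 2.0f again (exact in float).
    #include <assert.h>
    static void check_dc_preserved(void) {
      float row[5] = {2.0f, 2.0f, 2.0f, 2.0f, 2.0f};
      float tmp[5];
      float out[1];
      GaussCol_F32_C(row, row, row, row, row, tmp, 5);
      GaussRow_F32_C(tmp, out, 1);
      assert(out[0] == 2.0f);
    }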
@@ -2921,6 +2921,82 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }

+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width) {
+  asm volatile(
+      "ld2r       {v6.4s, v7.4s}, [%7]           \n"  // constants 4 and 6
+
+      "1:                                        \n"
+      "ld1        {v0.4s, v1.4s}, [%0], #32      \n"  // load 8 samples, 5 rows
+      "ld1        {v2.4s, v3.4s}, [%1], #32      \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "ld1        {v4.4s, v5.4s}, [%2], #32      \n"
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fmla       v0.4s, v4.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s, v3.4s}, [%3], #32      \n"
+      "fmla       v1.4s, v5.4s, v7.4s            \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "ld1        {v4.4s, v5.4s}, [%4], #32      \n"
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fadd       v0.4s, v0.4s, v4.4s            \n"  // * 1
+      "fadd       v1.4s, v1.4s, v5.4s            \n"
+      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s, v1.4s}, [%5], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      : "r"(&kGaussCoefficients)  // %7
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// Filter a row with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src, float* dst, int width) {
+  asm volatile(
+      "ld3r       {v6.4s, v7.4s, v8.4s}, [%3]    \n"  // constants 4, 6, 1/256
+
+      "1:                                        \n"
+      "ld1        {v0.4s, v1.4s, v2.4s}, [%0], %4 \n"  // load 12 samples
+      "fadd       v0.4s, v0.4s, v1.4s            \n"  // * 1
+      "ld1        {v4.4s, v5.4s}, [%0], %5       \n"
+      "fadd       v1.4s, v1.4s, v2.4s            \n"
+      "fmla       v0.4s, v4.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s, v3.4s}, [%0], %4       \n"
+      "fmla       v1.4s, v5.4s, v7.4s            \n"
+      "ld1        {v4.4s, v5.4s}, [%0], %6       \n"
+      "fadd       v2.4s, v2.4s, v4.4s            \n"
+      "fadd       v3.4s, v3.4s, v5.4s            \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fmul       v0.4s, v0.4s, v8.4s            \n"  // / 256
+      "fmul       v1.4s, v1.4s, v8.4s            \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s, v1.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "r"(&kGaussCoefficients),  // %3
+        "r"(8LL),   // %4
+        "r"(-4LL),  // %5
+        "r"(20LL)   // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+        "v8");
+}
+
 // Convert biplanar NV21 to packed YUV24
 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_vu,
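The assembly keeps the constants broadcast in v6/v7/v8 via ld2r/ld3r and processes eight floats per iteration, interleaving loads with fused multiply-adds so memory and arithmetic overlap. For readers more comfortable with intrinsics, one 4-float step of the column kernel could be written as follows (a sketch, not part of the commit; gauss_col_f32_step is an illustrative name):

    #include <arm_neon.h>
    // One 4-float step of the unnormalized 1, 4, 6, 4, 1 column filter.
    static void gauss_col_f32_step(const float* src0, const float* src1,
                                   const float* src2, const float* src3,
                                   const float* src4, float* dst) {
      float32x4_t acc = vld1q_f32(src0);              // * 1
      acc = vfmaq_n_f32(acc, vld1q_f32(src1), 4.0f);  // * 4
      acc = vfmaq_n_f32(acc, vld1q_f32(src2), 6.0f);  // * 6
      acc = vfmaq_n_f32(acc, vld1q_f32(src3), 4.0f);  // * 4
      acc = vaddq_f32(acc, vld1q_f32(src4));          // * 1
      vst1q_f32(dst, acc);  // unnormalized column sum
    }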
@@ -3234,33 +3234,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
 extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);

 TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
-  SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
-  SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
-  SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+  SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+  SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);

   memset(orig_pixels, 0, sizeof(orig_pixels));
   memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
   memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));

-  for (int i = 0; i < 640 + 4; ++i) {
+  for (int i = 0; i < 1280 + 8; ++i) {
     orig_pixels[i] = i * 256;
   }
-  GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
-  for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+  GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
     int has_neon = TestCpuFlag(kCpuHasNEON);
     if (has_neon) {
-      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
     } else {
-      GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+      GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
     }
 #else
-    GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+    GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
 #endif
   }

-  for (int i = 0; i < 640; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
@@ -3286,48 +3286,127 @@ extern "C" void GaussCol_C(const uint16_t* src0,
                            int width);

 TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
-  SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
-  SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
-  SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+  SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+  SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);

   memset(orig_pixels, 0, sizeof(orig_pixels));
   memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
   memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));

-  for (int i = 0; i < 640 * 5; ++i) {
-    orig_pixels[i] = i;
+  for (int i = 0; i < 1280 * 5; ++i) {
+    orig_pixels[i] = static_cast<float>(i);
   }
-  GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-             &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
-             640);
-  for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+  GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+             &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+             1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
     int has_neon = TestCpuFlag(kCpuHasNEON);
     if (has_neon) {
-      GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-                    &orig_pixels[640 * 3], &orig_pixels[640 * 4],
-                    &dst_pixels_opt[0], 640);
+      GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280],
+                    &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+                    &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
     } else {
-      GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-                 &orig_pixels[640 * 3], &orig_pixels[640 * 4],
-                 &dst_pixels_opt[0], 640);
+      GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+                 &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                 &dst_pixels_opt[0], 1280);
     }
 #else
-    GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-               &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
-               640);
+    GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+               &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+               &dst_pixels_opt[0], 1280);
 #endif
   }

-  for (int i = 0; i < 640; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
-
-  EXPECT_EQ(dst_pixels_c[0],
-            static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
-                                  640 * 4 * 1));
-  EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
 }

+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+  SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+  SIMD_ALIGNED(float dst_pixels_c[1280]);
+  SIMD_ALIGNED(float dst_pixels_opt[1280]);
+
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+  memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+  for (int i = 0; i < 1280 + 4; ++i) {
+    orig_pixels[i] = static_cast<float>(i);
+  }
+  GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+    } else {
+      GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+    }
+#else
+    GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+#endif
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+  SIMD_ALIGNED(float dst_pixels_c[1280]);
+  SIMD_ALIGNED(float dst_pixels_opt[1280]);
+  align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4);  // 5 rows
+  float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
+
+  memset(orig_pixels, 0, 1280 * 5 * 4);
+  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+  memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+  for (int i = 0; i < 1280 * 5; ++i) {
+    orig_pixels[i] = static_cast<float>(i);
+  }
+  GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+                 &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                 &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
+                        &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+                        &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+    } else {
+      GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
+                     &orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
+                     &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
+    }
+#else
+    GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+                   &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                   &dst_pixels_opt[0], 1280);
+#endif
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(orig_pixels_buf);
+}
+
 TEST_F(LibYUVPlanarTest, SwapUVRow) {
@@ -3360,6 +3439,39 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) {
   free_aligned_buffer_page_end(src_pixels_vu);
   free_aligned_buffer_page_end(dst_pixels_uv);
 }
 #endif
 #endif  // ENABLE_ROW_TESTS

+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+  const int kSize = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_pixels_opt, kSize);
+  align_buffer_page_end(dst_pixels_c, kSize);
+
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+  }
+  memset(dst_pixels_opt, 1, kSize);
+  memset(dst_pixels_c, 2, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+                 (float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
+                 benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+                   (float*)(dst_pixels_opt), benchmark_width_,
+                   benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i],
+                1.f)
+        << i;
+  }
+
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
+
 }  // namespace libyuv
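TestGaussPlane_F32 compares the optimized path against the C path with EXPECT_NEAR rather than EXPECT_EQ: the NEON kernels use fused multiply-adds, which round once per fmla, while the C kernels round after every multiply and add, so the two paths can legitimately differ in the low bits. A tiny illustration, not part of the commit:

    #include <math.h>
    // Two roundings (absent contraction) versus one rounding with FMA.
    float separate(float a, float b, float c) { return a * b + c; }
    float fused(float a, float b, float c) { return fmaf(a, b, c); }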