diff --git a/README.chromium b/README.chromium
index b5814b2c5..77317170b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1741
+Version: 1742
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 5299fe2c0..573952629 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -743,6 +743,19 @@ int ARGBBlur(const uint8_t* src_argb,
              int height,
              int radius);
 
+// Gaussian 5x5 blur a float plane.
+// Coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
+
 // Multiply ARGB image by ARGB value.
 LIBYUV_API
 int ARGBShade(const uint8_t* src_argb,
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index b721858f1..46afb0717 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -419,6 +419,9 @@ extern "C" {
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 #define HAS_SCALESUMSAMPLES_NEON
+#define HAS_GAUSSROW_F32_NEON
+#define HAS_GAUSSCOL_F32_NEON
+
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_ABGRTOUVROW_MSA
@@ -601,6 +604,7 @@ extern "C" {
 #endif
 typedef __declspec(align(16)) int16_t vec16[8];
 typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) float vecf32[4];
 typedef __declspec(align(16)) int8_t vec8[16];
 typedef __declspec(align(16)) uint16_t uvec16[8];
 typedef __declspec(align(16)) uint32_t uvec32[4];
@@ -620,6 +624,7 @@ typedef __declspec(align(32)) uint8_t ulvec8[32];
 #endif
 typedef int16_t __attribute__((vector_size(16))) vec16;
 typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef float __attribute__((vector_size(16))) vecf32;
 typedef int8_t __attribute__((vector_size(16))) vec8;
 typedef uint16_t __attribute__((vector_size(16))) uvec16;
 typedef uint32_t __attribute__((vector_size(16))) uvec32;
@@ -634,6 +639,7 @@ typedef uint8_t __attribute__((vector_size(32))) ulvec8;
 #define SIMD_ALIGNED(var) var
 typedef int16_t vec16[8];
 typedef int32_t vec32[4];
+typedef float vecf32[4];
 typedef int8_t vec8[16];
 typedef uint16_t uvec16[8];
 typedef uint32_t uvec32[4];
@@ -4256,6 +4262,25 @@ void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr,
                            const struct YuvConstants* yuvconstants,
                            int width);
 
+void GaussRow_F32_NEON(const float* src, float* dst, int width);
+void GaussRow_F32_C(const float* src, float* dst, int width);
+
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width);
+
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 4c446ba3d..9c3bb5aba 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1741
+#define LIBYUV_VERSION 1742
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f7cb4f8fe..7e7e6e35d 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -3043,6 +3043,84 @@ int ARGBShuffle(const uint8_t* src_bgra,
   return 0;
 }
 
+// Gauss blur a float plane using Gaussian 5x5 filter with
+// coefficients of 1, 4, 6, 4, 1.
+// Each destination pixel is a blur of the 5x5
+// pixels from the source.
+// Source edges are clamped.
+// Edge is 2 pixels on each side, and interior is multiple of 4.
+LIBYUV_API
+int GaussPlane_F32(const float* src,
+                   int src_stride,
+                   float* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
+  int y;
+  void (*GaussCol_F32)(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width) = GaussCol_F32_C;
+  void (*GaussRow_F32)(const float* src, float* dst, int width) = GaussRow_F32_C;
+  if (!src || !dst || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src = src + (height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+#if defined(HAS_GAUSSCOL_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussCol_F32 = GaussCol_F32_NEON;
+  }
+#endif
+#if defined(HAS_GAUSSROW_F32_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    GaussRow_F32 = GaussRow_F32_NEON;
+  }
+#endif
+  {
+    // 2 pixels on each side, but aligned out to 16 bytes.
+    align_buffer_64(rowbuf, (4 + width + 4) * 4);
+    memset(rowbuf, 0, 16);
+    memset(rowbuf + (4 + width) * 4, 0, 16);
+    float* row = (float*)(rowbuf + 16);
+    const float* src0 = src;
+    const float* src1 = src;
+    const float* src2 = src;
+    const float* src3 = src2 + ((height > 1) ? src_stride : 0);
+    const float* src4 = src3 + ((height > 2) ? src_stride : 0);
+
+    for (y = 0; y < height; ++y) {
+
+      GaussCol_F32(src0, src1, src2, src3, src4, row, width);
+
+      // Extrude edge by 2 floats
+      row[-2] = row[-1] = row[0];
+      row[width + 1] = row[width] = row[width - 1];
+
+      GaussRow_F32(row - 2, dst, width);
+
+      src0 = src1;
+      src1 = src2;
+      src2 = src3;
+      src3 = src4;
+      if ((y + 2) < (height - 1)) {
+        src4 += src_stride;
+      }
+      dst += dst_stride;
+    }
+    free_aligned_buffer_64(rowbuf);
+  }
+  return 0;
+}
+
 // Sobel ARGB effect.
 static int ARGBSobelize(const uint8_t* src_argb,
                         int src_stride_argb,
diff --git a/source/row_common.cc b/source/row_common.cc
index 70aa2e13c..15f00e886 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3358,6 +3358,29 @@ void GaussCol_C(const uint16_t* src0,
   }
 }
 
+void GaussRow_F32_C(const float* src, float* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ =
+        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * (1.0f / 256.0f);
+    ++src;
+  }
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_C(const float* src0,
+                    const float* src1,
+                    const float* src2,
+                    const float* src3,
+                    const float* src4,
+                    float* dst,
+                    int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+  }
+}
+
 // Convert biplanar NV21 to packed YUV24
 void NV21ToYUV24Row_C(const uint8_t* src_y,
                       const uint8_t* src_vu,
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 5646da8a2..8223b4737 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2921,6 +2921,82 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
+static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_F32_NEON(const float* src0,
+                       const float* src1,
+                       const float* src2,
+                       const float* src3,
+                       const float* src4,
+                       float* dst,
+                       int width) {
+  asm volatile(
+      "ld2r       {v6.4s, v7.4s}, [%7]           \n"  // constants 4 and 6
+
+      "1:                                        \n"
+      "ld1        {v0.4s, v1.4s}, [%0], #32      \n"  // load 8 samples, 5 rows
+      "ld1        {v2.4s, v3.4s}, [%1], #32      \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "ld1        {v4.4s, v5.4s}, [%2], #32      \n"
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fmla       v0.4s, v4.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s, v3.4s}, [%3], #32      \n"
+      "fmla       v1.4s, v5.4s, v7.4s            \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "ld1        {v4.4s, v5.4s}, [%4], #32      \n"
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fadd       v0.4s, v0.4s, v4.4s            \n"  // * 1
+      "fadd       v1.4s, v1.4s, v5.4s            \n"
+      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s, v1.4s}, [%5], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      : "r"(&kGaussCoefficients)  // %7
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussRow_F32_NEON(const float* src,
+                       float* dst,
+                       int width) {
+  asm volatile(
+      "ld3r       {v6.4s, v7.4s, v8.4s}, [%3]    \n"  // constants 4, 6, 1/256
+
+      "1:                                        \n"
+      "ld1        {v0.4s, v1.4s, v2.4s}, [%0], %4\n"  // load 12 samples, 5 rows
+      "fadd       v0.4s, v0.4s, v1.4s            \n"  // * 1
+      "ld1        {v4.4s, v5.4s}, [%0], %5       \n"
+      "fadd       v1.4s, v1.4s, v2.4s            \n"
+      "fmla       v0.4s, v4.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s, v3.4s}, [%0], %4       \n"
+      "fmla       v1.4s, v5.4s, v7.4s            \n"
+      "ld1        {v4.4s, v5.4s}, [%0], %6       \n"
+      "fadd       v2.4s, v2.4s, v4.4s            \n"
+      "fadd       v3.4s, v3.4s, v5.4s            \n"
+      "fmla       v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "fmla       v1.4s, v3.4s, v6.4s            \n"
+      "fmul       v0.4s, v0.4s, v8.4s            \n"  // / 256
+      "fmul       v1.4s, v1.4s, v8.4s            \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s, v1.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "r"(&kGaussCoefficients),  // %3
+        "r"(8LL),                  // %4
+        "r"(-4LL),                 // %5
+        "r"(20LL)                  // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
+}
+
 // Convert biplanar NV21 to packed YUV24
 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_vu,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 02cd1fbc3..c75f715af 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -3234,33 +3234,33 @@ extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width);
 extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width);
 
 TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
-  SIMD_ALIGNED(uint32_t orig_pixels[640 + 4]);
-  SIMD_ALIGNED(uint16_t dst_pixels_c[640]);
-  SIMD_ALIGNED(uint16_t dst_pixels_opt[640]);
+  SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]);
+  SIMD_ALIGNED(uint16_t dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]);
 
   memset(orig_pixels, 0, sizeof(orig_pixels));
   memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
   memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
 
-  for (int i = 0; i < 640 + 4; ++i) {
+  for (int i = 0; i < 1280 + 8; ++i) {
     orig_pixels[i] = i * 256;
   }
-  GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640);
-  for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+  GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
     int has_neon = TestCpuFlag(kCpuHasNEON);
     if (has_neon) {
-      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640);
+      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
     } else {
-      GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+      GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
     }
 #else
-    GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 640);
+    GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
 #endif
   }
 
-  for (int i = 0; i < 640; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
 
@@ -3286,48 +3286,127 @@ extern "C" void GaussCol_C(const uint16_t* src0,
                            int width);
 
 TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
-  SIMD_ALIGNED(uint16_t orig_pixels[640 * 5]);
-  SIMD_ALIGNED(uint32_t dst_pixels_c[640]);
-  SIMD_ALIGNED(uint32_t dst_pixels_opt[640]);
+  SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]);
+  SIMD_ALIGNED(uint32_t dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]);
 
   memset(orig_pixels, 0, sizeof(orig_pixels));
   memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
   memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
 
-  for (int i = 0; i < 640 * 5; ++i) {
-    orig_pixels[i] = i;
+  for (int i = 0; i < 1280 * 5; ++i) {
+    orig_pixels[i] = static_cast<uint16_t>(i);
   }
-  GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-             &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0],
-             640);
-  for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) {
+  GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+             &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0],
+             1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
     int has_neon = TestCpuFlag(kCpuHasNEON);
     if (has_neon) {
-      GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-                    &orig_pixels[640 * 3], &orig_pixels[640 * 4],
-                    &dst_pixels_opt[0], 640);
+      GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+                    &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                    &dst_pixels_opt[0], 1280);
     } else {
-      GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-                 &orig_pixels[640 * 3], &orig_pixels[640 * 4],
-                 &dst_pixels_opt[0], 640);
+      GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+                 &orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
+                 &dst_pixels_opt[0], 1280);
     }
 #else
-    GaussCol_C(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2],
-               &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_opt[0],
-               640);
+    GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
+               &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
+               1280);
 #endif
   }
 
-  for (int i = 0; i < 640; ++i) {
+  for (int i = 0; i < 1280; ++i) {
     EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
   }
+}
 
-  EXPECT_EQ(dst_pixels_c[0],
-            static_cast<uint32_t>(0 * 1 + 640 * 4 + 640 * 2 * 6 + 640 * 3 * 4 +
-                                  640 * 4 * 1));
-  EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
+TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) {
+  SIMD_ALIGNED(float orig_pixels[1280 + 4]);
+  SIMD_ALIGNED(float dst_pixels_c[1280]);
+  SIMD_ALIGNED(float dst_pixels_opt[1280]);
+
+  memset(orig_pixels, 0, sizeof(orig_pixels));
+  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+  memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+  for (int i = 0; i < 1280 + 4; ++i) {
+    orig_pixels[i] = static_cast<float>(i);
+  }
+  GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+    } else {
+      GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+    }
+#else
+    GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+#endif
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+}
+
+TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
+  SIMD_ALIGNED(float dst_pixels_c[1280]);
+  SIMD_ALIGNED(float dst_pixels_opt[1280]);
+  align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4);  // 5 rows
+  float* orig_pixels = reinterpret_cast<float*>(orig_pixels_buf);
+
+  memset(orig_pixels, 0, 1280 * 5 * 4);
+  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
+  memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
+
+  for (int i = 0; i < 1280 * 5; ++i) {
+    orig_pixels[i] = static_cast<float>(i);
+  }
+  GaussCol_F32_C(&orig_pixels[0],
+                 &orig_pixels[1280],
+                 &orig_pixels[1280 * 2],
+                 &orig_pixels[1280 * 3],
+                 &orig_pixels[1280 * 4],
+                 &dst_pixels_c[0], 1280);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+    int has_neon = TestCpuFlag(kCpuHasNEON);
+    if (has_neon) {
+      GaussCol_F32_NEON(&orig_pixels[0],
+                        &orig_pixels[1280],
+                        &orig_pixels[1280 * 2],
+                        &orig_pixels[1280 * 3],
+                        &orig_pixels[1280 * 4],
+                        &dst_pixels_opt[0], 1280);
+    } else {
+      GaussCol_F32_C(&orig_pixels[0],
+                     &orig_pixels[1280],
+                     &orig_pixels[1280 * 2],
+                     &orig_pixels[1280 * 3],
+                     &orig_pixels[1280 * 4],
+                     &dst_pixels_opt[0], 1280);
+    }
+#else
+    GaussCol_F32_C(&orig_pixels[0],
+                   &orig_pixels[1280],
+                   &orig_pixels[1280 * 2],
+                   &orig_pixels[1280 * 3],
+                   &orig_pixels[1280 * 4],
+                   &dst_pixels_opt[0], 1280);
+#endif
+  }
+
+  for (int i = 0; i < 1280; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(orig_pixels_buf);
 }
 
 TEST_F(LibYUVPlanarTest, SwapUVRow) {
@@ -3360,6 +3439,39 @@ TEST_F(LibYUVPlanarTest, SwapUVRow) {
   free_aligned_buffer_page_end(src_pixels_vu);
   free_aligned_buffer_page_end(dst_pixels_uv);
 }
-#endif
+#endif  // ENABLE_ROW_TESTS
+
+TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
+  const int kSize = benchmark_width_ * benchmark_height_ * 4;
+  align_buffer_page_end(orig_pixels, kSize);
+  align_buffer_page_end(dst_pixels_opt, kSize);
+  align_buffer_page_end(dst_pixels_c, kSize);
+
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f;
+  }
+  memset(dst_pixels_opt, 1, kSize);
+  memset(dst_pixels_c, 2, kSize);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+                 (float*)(dst_pixels_c), benchmark_width_,
+                 benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
+                   (float*)(dst_pixels_opt), benchmark_width_,
+                   benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    EXPECT_NEAR(((float*)(dst_pixels_c))[i],
+                ((float*)(dst_pixels_opt))[i], 1.f) << i;
+  }
+
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(orig_pixels);
+}
 
 }  // namespace libyuv
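
Usage sketch (illustrative, not part of the patch): the new GaussPlane_F32 entry point takes src_stride and dst_stride in float elements rather than bytes, which is how the TestGaussPlane_F32 test above calls it (stride == benchmark_width_). The image size and fill pattern below are arbitrary; the width is kept a multiple of 8 so the NEON row and column kernels added in this change can be selected at runtime.

#include <vector>
#include "libyuv/planar_functions.h"

int main() {
  const int width = 64;   // multiple of 8 enables the NEON path
  const int height = 32;
  std::vector<float> src(width * height);
  std::vector<float> dst(width * height);
  for (int i = 0; i < width * height; ++i) {
    src[i] = static_cast<float>(i & 255);  // arbitrary test pattern
  }
  // Separable 5x5 Gaussian blur with 1, 4, 6, 4, 1 taps; source edges are
  // clamped. Returns 0 on success, -1 on invalid arguments.
  return libyuv::GaussPlane_F32(src.data(), width, dst.data(), width, width,
                                height);
}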