diff --git a/README.chromium b/README.chromium index ac76a2349..46459dee2 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 251 +Version: 252 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 30e7cb5f9..c846cc71c 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -170,6 +170,11 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Get function to add or subtract rows of bytes to a 16 bit buffer. For blur. +typedef void (*AddRow)(const uint8* src, uint16* dst, int width); +AddRow GetAddRow(uint16* dst, int width); +AddRow GetSubRow(uint16* dst, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/scale.h b/include/libyuv/scale.h index be7da448a..7fb421c06 100644 --- a/include/libyuv/scale.h +++ b/include/libyuv/scale.h @@ -66,6 +66,14 @@ int ScaleOffset(const uint8* src, int src_width, int src_height, uint8* dst, int dst_width, int dst_height, int dst_yoffset, bool interpolate); +typedef void (*ARGBBlendRow)(const uint8* src_argb0, + const uint8* src_argb1, + uint8* dst_argb, int width); + +// Get function to Alpha Blend ARGB pixels and store to destination. +ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width); + + // For testing, allow disabling of optimizations. 
void SetUseReferenceImpl(bool use); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d9437d953..b2e1c29e3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 251 +#define LIBYUV_VERSION 252 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 0fae87118..8cfb98bc7 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -944,6 +944,32 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, return 0; } +// AddRow is useful for summing up rows of an image, when implementing a +// box filter or blur effect. +AddRow GetAddRow(uint16* dst, int width) { + AddRow AddRowF = AddRow_C; +#if defined(HAS_ADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(width, 16)) { + AddRowF = AddRow_SSE2; + } +#endif + return AddRowF; +} + +// SubRow is useful when a sum of rows exists and the caller wants to +// remove a row and add a new row without recomputing the full sum of rows. 
+AddRow GetSubRow(uint16* dst, int width) { + AddRow SubRowF = SubRow_C; +#if defined(HAS_ADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(width, 16)) { + SubRowF = SubRow_SSE2; + } +#endif + return SubRowF; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row.h b/source/row.h index 98f9f32e4..4ab42987a 100644 --- a/source/row.h +++ b/source/row.h @@ -61,6 +61,7 @@ extern "C" { #define HAS_I444TOARGBROW_SSSE3 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROWUV_SSSE3 +#define HAS_ADDROW_SSE2 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 @@ -152,6 +153,11 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width); void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width); void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void AddRow_SSE2(const uint8* src, uint16* dst, int width); +void SubRow_SSE2(const uint8* src, uint16* dst, int width); +void AddRow_C(const uint8* src, uint16* dst, int width); +void SubRow_C(const uint8* src, uint16* dst, int width); + void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index 769fe793c..083b79e36 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -386,6 +386,30 @@ void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } +void AddRow_C(const uint8* src, uint16* dst, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst[0] += static_cast<uint16>(src[0]); + dst[1] += static_cast<uint16>(src[1]); + src += 2; + dst += 2; + } + if (width & 1) { + dst[0] += static_cast<uint16>(src[0]); + } +} + +void SubRow_C(const uint8* src, uint16* dst, int width) { + for (int x = 0; x < width - 1; x += 2) { + 
dst[0] -= static_cast<uint16>(src[0]); + dst[1] -= static_cast<uint16>(src[1]); + src += 2; + dst += 2; + } + if (width & 1) { + dst[0] -= static_cast<uint16>(src[0]); + } +} + void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { for (int x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; diff --git a/source/row_posix.cc b/source/row_posix.cc index fe21b397f..1a8f4fb8f 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1690,6 +1690,68 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, } #endif +#ifdef HAS_ADDROW_SSE2 +// dst and width aligned to 16 +void AddRow_SSE2(const uint8* src, uint16* dst, int width) { + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa (%1),%%xmm0 \n" + "movdqa 0x10(%1),%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} + +// dst and width aligned to 16 +void SubRow_SSE2(const uint8* src, uint16* dst, int width) { + asm volatile ( + "pxor %%xmm4,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa (%1),%%xmm0 \n" + "movdqa 0x10(%1),%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "psubusw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm1 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" +#endif + ); +} +#endif // HAS_ADDROW_SSE2 + #ifdef HAS_SPLITUV_SSE2 void SplitUV_SSE2(const uint8* 
src_uv, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( diff --git a/source/row_win.cc b/source/row_win.cc index 2e538e321..171bffad4 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1716,6 +1716,65 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, } #endif +#ifdef HAS_ADDROW_SSE2 +// dst and width aligned to 16 +__declspec(naked) __declspec(align(16)) +void AddRow_SSE2(const uint8* src, uint16* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pxor xmm4, xmm4 + + align 16 + convertloop: + movdqu xmm2, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqa xmm0, [edx] // read first 8 words + movdqa xmm1, [edx + 16] // read next 8 words + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + paddusw xmm0, xmm2 // add 16 words + paddusw xmm1, xmm3 + sub ecx, 16 + movdqa [edx], xmm0 // store 16 words + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void SubRow_SSE2(const uint8* src, uint16* dst, int width) { +__asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pxor xmm4, xmm4 + + align 16 + convertloop: + movdqu xmm2, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqa xmm0, [edx] // read first 8 words + movdqa xmm1, [edx + 16] // read next 8 words + movdqa xmm3, xmm2 + punpcklbw xmm2, xmm4 + punpckhbw xmm3, xmm4 + psubusw xmm0, xmm2 // sub 16 words + psubusw xmm1, xmm3 + sub ecx, 16 + movdqa [edx], xmm0 // store 16 words + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + jg convertloop + ret + } +} +#endif // HAS_ADDROW_SSE2 + #ifdef HAS_SPLITUV_SSE2 __declspec(naked) __declspec(align(16)) void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 627acdfb7..cb9f736c4 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -193,4 
+193,31 @@ TEST_F(libyuvTest, TestAttenuate) { EXPECT_EQ(85, atten_pixels[255][2]); EXPECT_EQ(255, atten_pixels[255][3]); } + +TEST_F(libyuvTest, TestAddRow) { + SIMD_ALIGNED(uint8 orig_pixels[256]); + SIMD_ALIGNED(uint16 added_pixels[256]); + + libyuv::AddRow AddRow = GetAddRow(added_pixels, 256); + libyuv::AddRow SubRow = GetSubRow(added_pixels, 256); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i] = i; + } + memset(added_pixels, 0, sizeof(uint16) * 256); + + AddRow(orig_pixels, added_pixels, 256); + EXPECT_EQ(7u, added_pixels[7]); + EXPECT_EQ(250u, added_pixels[250]); + AddRow(orig_pixels, added_pixels, 256); + EXPECT_EQ(14u, added_pixels[7]); + EXPECT_EQ(500u, added_pixels[250]); + SubRow(orig_pixels, added_pixels, 256); + EXPECT_EQ(7u, added_pixels[7]); + EXPECT_EQ(250u, added_pixels[250]); + + for (int i = 0; i < 1000 * (1280 * 720 * 4 / 256); ++i) { + AddRow(orig_pixels, added_pixels, 256); + } +} }