diff --git a/README.chromium b/README.chromium index 95f53e026..f9943f523 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 607 +Version: 608 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7a18221b0..16df7084e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -120,6 +120,8 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #define HAS_ARGBCOLORTABLEROW_X86 #define HAS_COPYROW_AVX2 +#define HAS_SOBELXROW_SSSE3 +#define HAS_SOBELYROW_SSSE3 // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion. @@ -1419,6 +1421,16 @@ void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride_argb, int dst_width, int source_y_fraction); +// Sobel images. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width); +void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); +void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 66ab48d2f..3de4f02bb 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 607 +#define LIBYUV_VERSION 608 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_common.cc b/source/row_common.cc index 8490846dd..2ba95214e 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -712,6 +712,53 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef 
SHADE +// Sobel functions which mimic SSSE3. +void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, + uint8* dst_sobelx, int width) { + for (int i = 0; i < width; ++i) { + int a = src_y0[i]; + int b = src_y1[i]; + int c = src_y2[i]; + int a_sub = src_y0[i + 2]; + int b_sub = src_y1[i + 2]; + int c_sub = src_y2[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = a_diff + b_diff * 2 + c_diff; + if (sobel < 0) { + sobel = -sobel; + } + if (sobel > 255) { + sobel = 255; + } + dst_sobelx[i] = static_cast<uint8>(sobel); + } +} + +void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + for (int i = 0; i < width; ++i) { + int a = src_y0[i + 0]; + int b = src_y0[i + 1]; + int c = src_y0[i + 2]; + int a_sub = src_y1[i + 0]; + int b_sub = src_y1[i + 1]; + int c_sub = src_y1[i + 2]; + int a_diff = a - a_sub; + int b_diff = b - b_sub; + int c_diff = c - c_sub; + int sobel = a_diff + b_diff * 2 + c_diff; + if (sobel < 0) { + sobel = -sobel; + } + if (sobel > 255) { + sobel = 255; + } + dst_sobely[i] = static_cast<uint8>(sobel); + } +} + void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // Copy a Y to RGB. 
for (int x = 0; x < width; ++x) { diff --git a/source/row_win.cc b/source/row_win.cc index c1105d2e0..57a69bae8 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5027,6 +5027,112 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBSUBTRACTROW_AVX2 +#ifdef HAS_SOBELXROW_SSSE3 +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +__declspec(naked) __declspec(align(16)) +void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 + mov edi, [esp + 8 + 12] // src_y2 + mov edx, [esp + 8 + 16] // dst_sobelx + mov ecx, [esp + 8 + 20] // width + sub esi, eax + sub edi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 16 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. 
+ packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop edi + pop esi + ret + } +} +#endif // HAS_SOBELXROW_SSSE3 + +#ifdef HAS_SOBELYROW_SSSE3 +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +__declspec(naked) __declspec(align(16)) +void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 + mov edx, [esp + 4 + 12] // dst_sobely + mov ecx, [esp + 4 + 16] // width + sub esi, eax + sub edx, eax + pxor xmm5, xmm5 // constant 0 + + align 16 + convertloop: + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + psubw xmm0, xmm1 + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + psubw xmm1, xmm2 + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + psubw xmm2, xmm3 + paddw xmm0, xmm2 + paddw xmm0, xmm1 + paddw xmm0, xmm1 + pabsw xmm0, xmm0 // SSSE3. Could use SSE2 psubusw twice instead. + packuswb xmm0, xmm0 + sub ecx, 8 + movq qword ptr [eax + edx], xmm0 + lea eax, [eax + 8] + jg convertloop + + pop esi + ret + } +} +#endif // HAS_SOBELYROW_SSSE3 + #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed. 
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a832045f2..886b36dd1 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -20,6 +20,7 @@ #include "libyuv/format_conversion.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/row.h" // For Sobel #include "../unit_test/unit_test.h" #if defined(_MSC_VER) @@ -890,6 +891,77 @@ TEST_F(libyuvTest, TestAffine) { #endif } +TEST_F(libyuvTest, TestSobelX) { + SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[256]); + + for (int i = 0; i < 256 + 2; ++i) { + orig_pixels_0[i] = i; + orig_pixels_1[i] = i * 2; + orig_pixels_2[i] = i * 3; + } + + SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2, + sobel_pixels_c, 256); + + EXPECT_EQ(16u, sobel_pixels_c[0]); + EXPECT_EQ(16u, sobel_pixels_c[100]); + EXPECT_EQ(255u, sobel_pixels_c[255]); +#if defined(HAS_SOBELXROW_SSSE3) + SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (has_ssse3) { + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + SobelXRow_SSSE3(orig_pixels_0, orig_pixels_1, orig_pixels_2, + sobel_pixels_opt, 256); + } + } else { + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2, + sobel_pixels_opt, 256); + } + } + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + } +#endif +} + +TEST_F(libyuvTest, TestSobelY) { + SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]); + SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]); + SIMD_ALIGNED(uint8 sobel_pixels_c[256]); + + for (int i = 0; i < 256 + 2; ++i) { + orig_pixels_0[i] = i; + orig_pixels_1[i] = i * 2; + } + + SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_c, 256); + + EXPECT_EQ(4u, sobel_pixels_c[0]); + EXPECT_EQ(255u, sobel_pixels_c[100]); + EXPECT_EQ(0u, sobel_pixels_c[255]); +#if 
defined(HAS_SOBELYROW_SSSE3) + SIMD_ALIGNED(uint8 sobel_pixels_opt[256]); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + if (has_ssse3) { + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + SobelYRow_SSSE3(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256); + } + } else { + for (int i = 0; i < benchmark_pixels_div256_; ++i) { + SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256); + } + } + for (int i = 0; i < 256; ++i) { + EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]); + } +#endif +} + TEST_F(libyuvTest, TestCopyPlane) { int err = 0; int yw = benchmark_width_;