From 9d48df9ac434cf036bd8ec648fc040756a382c60 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Sun, 24 Mar 2013 20:12:25 +0000
Subject: [PATCH] Sobel port to posix. Improved unittest for C version.

BUG=201
TESTED=try bots
Review URL: https://webrtc-codereview.appspot.com/1242004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@619 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 include/libyuv/row.h       |   8 +-
 include/libyuv/version.h   |   2 +-
 source/planar_functions.cc |   1 -
 source/row_posix.cc        | 211 +++++++++++++++++++++++++++++++++++++
 unit_test/planar_test.cc   |  85 +++++++--------
 6 files changed, 256 insertions(+), 53 deletions(-)

diff --git a/README.chromium b/README.chromium
index a678b67f0..31fc32edf 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 618
+Version: 619
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index d5a0452b7..0d3848974 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -118,6 +118,10 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELXROW_SSSE3
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSSE3
 #endif
 
 // The following are Windows only.
@@ -125,10 +129,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_COPYROW_AVX2
-#define HAS_SOBELXROW_SSSE3
-#define HAS_SOBELYROW_SSSE3
-#define HAS_SOBELROW_SSE2
-#define HAS_SOBELXYROW_SSE2
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 42062b779..93df7444d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 618
+#define LIBYUV_VERSION 619
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 68567077b..d88dc60c3 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1970,7 +1970,6 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
     SobelXRow = SobelXRow_SSSE3;
   }
 #endif
-
   void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 471f94585..e26303e80 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -4067,6 +4067,217 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
 
+#ifdef HAS_SOBELXROW_SSSE3
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+                     const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "sub %0,%2 \n"
+    "sub %0,%3 \n"
+    "pxor %%xmm5,%%xmm5 \n"
+
+    // 8 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movq (%0),%%xmm0 \n"
+    "movq 0x2(%0),%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "psubw %%xmm1,%%xmm0 \n"
+    "movq (%0,%1,1),%%xmm1 \n"
+    "movq 0x2(%0,%1,1),%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "psubw %%xmm2,%%xmm1 \n"
+    "movq (%0,%2,1),%%xmm2 \n"
+    "movq 0x2(%0,%2,1),%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "psubw %%xmm3,%%xmm2 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "pabsw %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "sub $0x8,%4 \n"
+    "movq %%xmm0,(%0,%3,1) \n"
+    "lea 0x8(%0),%0 \n"
+    "jg 1b \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELXROW_SSSE3
+
+#ifdef HAS_SOBELYROW_SSSE3
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1,
+                     uint8* dst_sobely, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "sub %0,%2 \n"
+    "pxor %%xmm5,%%xmm5 \n"
+
+    // 8 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movq (%0),%%xmm0 \n"
+    "movq (%0,%1,1),%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "psubw %%xmm1,%%xmm0 \n"
+    "movq 0x1(%0),%%xmm1 \n"
+    "movq 0x1(%0,%1,1),%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm1 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "psubw %%xmm2,%%xmm1 \n"
+    "movq 0x2(%0),%%xmm2 \n"
+    "movq 0x2(%0,%1,1),%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "psubw %%xmm3,%%xmm2 \n"
+    "paddw %%xmm2,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "paddw %%xmm1,%%xmm0 \n"
+    "pabsw %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "sub $0x8,%3 \n"
+    "movq %%xmm0,(%0,%2,1) \n"
+    "lea 0x8(%0),%0 \n"
+    "jg 1b \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELYROW_SSSE3
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pslld $0x18,%%xmm5 \n"
+
+    // 16 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa (%0,%1,1),%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "paddusb %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "punpcklbw %%xmm0,%%xmm2 \n"
+    "punpckhbw %%xmm0,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm1 \n"
+    "punpcklwd %%xmm2,%%xmm1 \n"
+    "punpckhwd %%xmm2,%%xmm2 \n"
+    "por %%xmm5,%%xmm1 \n"
+    "por %%xmm5,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm3 \n"
+    "punpcklwd %%xmm0,%%xmm3 \n"
+    "punpckhwd %%xmm0,%%xmm0 \n"
+    "por %%xmm5,%%xmm3 \n"
+    "por %%xmm5,%%xmm0 \n"
+    "sub $0x10,%3 \n"
+    "movdqa %%xmm1,(%2) \n"
+    "movdqa %%xmm2,0x10(%2) \n"
+    "movdqa %%xmm3,0x20(%2) \n"
+    "movdqa %%xmm0,0x30(%2) \n"
+    "lea 0x40(%2),%2 \n"
+    "jg 1b \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub %0,%1 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+
+    // 16 pixel loop.
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa (%0,%1,1),%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "paddusb %%xmm1,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm3 \n"
+    "punpcklbw %%xmm5,%%xmm3 \n"
+    "punpckhbw %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm1,%%xmm4 \n"
+    "punpcklbw %%xmm2,%%xmm4 \n"
+    "punpckhbw %%xmm2,%%xmm1 \n"
+    "movdqa %%xmm4,%%xmm6 \n"
+    "punpcklwd %%xmm3,%%xmm6 \n"
+    "punpckhwd %%xmm3,%%xmm4 \n"
+    "movdqa %%xmm1,%%xmm7 \n"
+    "punpcklwd %%xmm0,%%xmm7 \n"
+    "punpckhwd %%xmm0,%%xmm1 \n"
+    "sub $0x10,%3 \n"
+    "movdqa %%xmm6,(%2) \n"
+    "movdqa %%xmm4,0x10(%2) \n"
+    "movdqa %%xmm7,0x20(%2) \n"
+    "movdqa %%xmm1,0x30(%2) \n"
+    "lea 0x40(%2),%2 \n"
+    "jg 1b \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 029e2d03b..94284b66b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -896,6 +896,7 @@ TEST_F(libyuvTest, TestSobelX) {
   SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
   SIMD_ALIGNED(uint8 orig_pixels_2[256 + 2]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
 
   for (int i = 0; i < 256 + 2; ++i) {
     orig_pixels_0[i] = i;
@@ -909,30 +910,29 @@
   EXPECT_EQ(16u, sobel_pixels_c[0]);
   EXPECT_EQ(16u, sobel_pixels_c[100]);
   EXPECT_EQ(255u, sobel_pixels_c[255]);
+
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) =
+      SobelXRow_C;
 #if defined(HAS_SOBELXROW_SSSE3)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  if (has_ssse3) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXRow_SSSE3(orig_pixels_0, orig_pixels_1, orig_pixels_2,
-                      sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXRow_C(orig_pixels_0, orig_pixels_1, orig_pixels_2,
-                  sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SobelXRow = SobelXRow_SSSE3;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2,
+              sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 256; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobelY) {
   SIMD_ALIGNED(uint8 orig_pixels_0[256 + 2]);
   SIMD_ALIGNED(uint8 orig_pixels_1[256 + 2]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
 
   for (int i = 0; i < 256 + 2; ++i) {
     orig_pixels_0[i] = i;
@@ -944,28 +944,26 @@
   EXPECT_EQ(4u, sobel_pixels_c[0]);
   EXPECT_EQ(255u, sobel_pixels_c[100]);
   EXPECT_EQ(0u, sobel_pixels_c[255]);
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
 #if defined(HAS_SOBELYROW_SSSE3)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256]);
-  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
-  if (has_ssse3) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelYRow_SSSE3(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelYRow_C(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SobelYRow = SobelYRow_SSSE3;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 256; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobel) {
   SIMD_ALIGNED(uint8 orig_sobelx[256]);
   SIMD_ALIGNED(uint8 orig_sobely[256]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
 
   for (int i = 0; i < 256; ++i) {
     orig_sobelx[i] = i;
@@ -985,28 +983,26 @@
   EXPECT_EQ(255u, sobel_pixels_c[7]);
   EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
   EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  if (has_sse2) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelRow_SSE2(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelRow_C(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelRow = SobelRow_SSE2;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 16; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
   }
-#endif
 }
 
 TEST_F(libyuvTest, TestSobelXY) {
   SIMD_ALIGNED(uint8 orig_sobelx[256]);
   SIMD_ALIGNED(uint8 orig_sobely[256]);
   SIMD_ALIGNED(uint8 sobel_pixels_c[256 * 4]);
+  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
 
   for (int i = 0; i < 256; ++i) {
     orig_sobelx[i] = i;
@@ -1022,22 +1018,19 @@
   EXPECT_EQ(255u, sobel_pixels_c[7]);
   EXPECT_EQ(255u, sobel_pixels_c[100 * 4 + 1]);
   EXPECT_EQ(255u, sobel_pixels_c[255 * 4 + 1]);
+  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
-  SIMD_ALIGNED(uint8 sobel_pixels_opt[256 * 4]);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
-  if (has_sse2) {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXYRow_SSE2(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
-  } else {
-    for (int i = 0; i < benchmark_pixels_div256_; ++i) {
-      SobelXYRow_C(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
-    }
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXYRow = SobelXYRow_SSE2;
+  }
+#endif
+  for (int i = 0; i < benchmark_pixels_div256_; ++i) {
+    SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);
   }
   for (int i = 0; i < 16; ++i) {
     EXPECT_EQ(sobel_pixels_opt[i], sobel_pixels_c[i]);
  }
-#endif
 }
 
 TEST_F(libyuvTest, TestCopyPlane) {
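
For reference, a minimal scalar sketch of the gradient math the SobelXRow_SSSE3 and SobelYRow_SSSE3 kernels above compute (the *_Ref names are illustrative only, not libyuv API). Each output byte is |a + 2*b + c| clamped to 255, built from three left-minus-right (or top-minus-bottom) differences; the kernels compute the negation of the matrices shown in the comments, which is equivalent under the absolute value (pabsw), and packuswb supplies the clamp.

#include <stdint.h>
#include <stdlib.h>

// Clamp a non-negative sum to a byte, mirroring packuswb saturation.
static uint8_t Clamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : v);
}

// Scalar sketch of SobelXRow: src_y0/src_y1/src_y2 are the rows above, at,
// and below the output row; each must be readable for width + 2 bytes.
void SobelXRow_Ref(const uint8_t* src_y0, const uint8_t* src_y1,
                   const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];  // top row, weight 1
    int b = src_y1[i] - src_y1[i + 2];  // middle row, weight 2
    int c = src_y2[i] - src_y2[i + 2];  // bottom row, weight 1
    dst_sobelx[i] = Clamp255(abs(a + 2 * b + c));
  }
}

// Scalar sketch of SobelYRow: only two rows are needed because the middle
// row of the SobelY matrix is all zeros.
void SobelYRow_Ref(const uint8_t* src_y0, const uint8_t* src_y1,
                   uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y1[i];          // left column, weight 1
    int b = src_y0[i + 1] - src_y1[i + 1];  // center column, weight 2
    int c = src_y0[i + 2] - src_y1[i + 2];  // right column, weight 1
    dst_sobely[i] = Clamp255(abs(a + 2 * b + c));
  }
}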
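Likewise, a hedged scalar sketch of the two SSE2 packing kernels. SobelRow writes the saturated X+Y sum to B, G and R with A = 255; SobelXYRow packs Sobel Y into B, the saturated sum into G and Sobel X into R, as the header comments state. Bytes are shown in B, G, R, A memory order, assuming libyuv's little-endian ARGB layout; the SIMD paddusb (unsigned saturating add) is modeled with an explicit clamp.

#include <stdint.h>

// Scalar sketch of SobelRow_SSE2: one gray Sobel value replicated into
// B, G and R, with alpha forced to 255 (the pslld $0x18 mask above).
void SobelRow_Ref(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                  uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // paddusb saturates
    dst_argb[0] = v;    // B = Sobel
    dst_argb[1] = v;    // G = Sobel
    dst_argb[2] = v;    // R = Sobel
    dst_argb[3] = 255;  // A = opaque
    dst_argb += 4;
  }
}

// Scalar sketch of SobelXYRow_SSE2: the punpcklbw/punpcklwd cascade above
// interleaves the same four bytes per pixel that are written here.
void SobelXYRow_Ref(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                    uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];                 // B = Sobel Y
    dst_argb[1] = (uint8_t)(s > 255 ? 255 : s);  // G = combined Sobel
    dst_argb[2] = src_sobelx[i];                 // R = Sobel X
    dst_argb[3] = 255;                           // A = opaque
    dst_argb += 4;
  }
}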