From c56a55fc7206a257eecc21969f94ab066dd80f2f Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 4 Apr 2013 18:33:44 +0000 Subject: [PATCH] Sobel and SobelXY Neon port. Improved Bayer - did 8 at time version, and specialized G channel version. BUG=201 TEST=libyuvTest.TestSobel and libyuvTest.TestSobelXY Review URL: https://webrtc-codereview.appspot.com/1279006 git-svn-id: http://libyuv.googlecode.com/svn/trunk@642 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 35 ++++++--- include/libyuv/version.h | 2 +- source/format_conversion.cc | 8 +- source/planar_functions.cc | 38 +++++++++- source/row_any.cc | 2 +- source/row_neon.cc | 142 ++++++++++++++++++++++++++++++++++-- unit_test/planar_test.cc | 20 +++++ 8 files changed, 222 insertions(+), 27 deletions(-) diff --git a/README.chromium b/README.chromium index 816d70514..e263c2cf6 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 641 +Version: 642 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 84bbbba46..fc31c8eac 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -197,6 +197,7 @@ extern "C" { #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTOBAYERGGROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565ROW_NEON @@ -269,6 +270,10 @@ extern "C" { #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON +#define HAS_SOBELROW_NEON +#define HAS_SOBELXYROW_NEON +#define HAS_SOBELXROW_NEON +#define HAS_SOBELYROW_NEON #endif // The following are available on Mips platforms @@ -1315,16 +1320,18 @@ void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, uint8* dst_uv, int pix); -void ARGBToBayerRow_C(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); -void ARGBToBayerRow_SSSE3(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); -void ARGBToBayerRow_NEON(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); -void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); -void ARGBToBayerRow_Any_NEON(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix); +void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, + uint32 /* selector */, int pix); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, @@ -1459,18 +1466,26 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); void SobelXRow_SSSE3(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width); +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width); void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width); +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width); void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); #ifdef __cplusplus } // extern "C" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 93d8adda0..933031adc 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 641 +#define LIBYUV_VERSION 642 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 53955f715..d2e773f3d 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -80,9 +80,9 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb, } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_NEON; } } @@ -437,9 +437,9 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_NEON; } } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 77af629a1..72dff8b65 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1769,9 +1769,9 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_NEON; } } @@ -1782,6 +1782,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasSSSE3)) { SobelYRow = SobelYRow_SSSE3; } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } #endif void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = @@ -1790,6 +1795,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasSSSE3)) { SobelXRow = SobelXRow_SSSE3; } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } #endif void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; @@ -1799,6 +1809,11 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, SobelRow = SobelRow_SSE2; } #endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } +#endif const int kEdge = 16; // Extra pixels at start of row for extrude/align. SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); @@ -1868,9 +1883,9 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_NEON; } } @@ -1881,6 +1896,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasSSSE3)) { SobelYRow = SobelYRow_SSSE3; } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } #endif void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobely, int width) = @@ -1889,6 +1909,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasSSSE3)) { SobelXRow = SobelXRow_SSSE3; } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } #endif void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; @@ -1898,6 +1923,11 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, SobelXYRow = SobelXYRow_SSE2; } #endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } +#endif const int kEdge = 16; // Extra pixels at start of row for extrude/align. SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); diff --git a/source/row_any.cc b/source/row_any.cc index 6c0d4f4a3..7e042d603 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -190,7 +190,7 @@ BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, #endif #if defined(HAS_ARGBTOBAYERROW_NEON) BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, - 3, 4, 1) + 7, 4, 1) #endif #undef BAYERANY diff --git a/source/row_neon.cc b/source/row_neon.cc index 4796ae3b4..6f075a84d 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1176,18 +1176,20 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { asm volatile ( - "vmov.u32 d2[0], %3 \n" // selector + "vmov.u32 d6[0], %3 \n" // selector "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load row 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d3, {d0, d1}, d2 \n" // look up 4 pixels - "vst1.u32 {d3[0]}, [%1]! \n" // store 4. + "vld1.u8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels + "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels + "vtrn.u32 d4, d5 \n" // combine 8 pixels + "vst1.u8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 : "r"(selector) // %3 - : "cc", "memory", "q0", "q1" // Clobber List + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } @@ -2595,6 +2597,134 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ); } +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. +// A = 255 +// R = Sobel +// G = Sobel +// B = Sobel +void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// Mixes Sobel X, Sobel Y and Sobel into ARGB. +// A = 255 +// R = Sobel X +// G = Sobel +// B = Sobel Y +void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width) { + asm volatile ( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + ".p2align 2 \n" + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1" + ); +} + +// SobelX as a matrix is +// -1 0 1 +// -2 0 2 +// -1 0 1 +void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, + const uint8* src_y2, uint8* dst_sobelx, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.u8 {d0}, [%0],%5 \n" // top + "vld1.u8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.u8 {d2}, [%1],%5 \n" // center * 2 + "vld1.u8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.u8 {d2}, [%2],%5 \n" // bottom + "vld1.u8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.u8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SobelY as a matrix is +// -1 -2 -1 +// 0 0 0 +// 1 2 1 +void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, + uint8* dst_sobely, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld1.u8 {d0}, [%0],%4 \n" // left + "vld1.u8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.u8 {d2}, [%0],%4 \n" // center * 2 + "vld1.u8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.u8 {d2}, [%0],%5 \n" // right + "vld1.u8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.u8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index efcd33dc0..469647667 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -977,6 +977,11 @@ TEST_F(libyuvTest, TestSobelX) { if (TestCpuFlag(kCpuHasSSSE3)) { SobelXRow = SobelXRow_SSSE3; } +#endif +#if defined(HAS_SOBELXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXRow = SobelXRow_NEON; + } #endif for (int i = 0; i < benchmark_pixels_div256_; ++i) { SobelXRow(orig_pixels_0, orig_pixels_1, orig_pixels_2, @@ -1009,6 +1014,11 @@ TEST_F(libyuvTest, TestSobelY) { if (TestCpuFlag(kCpuHasSSSE3)) { SobelYRow = SobelYRow_SSSE3; } +#endif +#if defined(HAS_SOBELYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelYRow = SobelYRow_NEON; + } #endif for (int i = 0; i < benchmark_pixels_div256_; ++i) { SobelYRow(orig_pixels_0, orig_pixels_1, sobel_pixels_opt, 256); @@ -1048,6 +1058,11 @@ TEST_F(libyuvTest, TestSobel) { if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_SSE2; } +#endif +#if defined(HAS_SOBELROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_NEON; + } #endif for (int i = 0; i < benchmark_pixels_div256_; ++i) { SobelRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256); @@ -1083,6 +1098,11 @@ TEST_F(libyuvTest, TestSobelXY) { if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_SSE2; } +#endif +#if defined(HAS_SOBELXYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_NEON; + } #endif for (int i = 0; i < benchmark_pixels_div256_; ++i) { SobelXYRow(orig_sobelx, orig_sobely, sobel_pixels_opt, 256);