From ea8d0eb0d1a21555701351dd64d018c74004ca0b Mon Sep 17 00:00:00 2001 From: "frkoenig@google.com" Date: Wed, 4 Jan 2012 23:04:07 +0000 Subject: [PATCH] ScaleFilterRows optimized for NEON. Includes unit test that scales the image up by 2. Currently this is done using the generic bilinear scale. Review URL: http://webrtc-codereview.appspot.com/330032 git-svn-id: http://libyuv.googlecode.com/svn/trunk@126 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- source/scale.cc | 61 +++++++++++++++++++++++++++++++++++++++++ unit_test/scale_test.cc | 32 +++++++++++++++------ 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/README.chromium b/README.chromium index 2065c316f..6fa90e921 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 125 +Version: 126 License: BSD License File: LICENSE diff --git a/source/scale.cc b/source/scale.cc index 7db189984..cba7db7ed 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -502,6 +502,61 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, ); } +// 16x2 -> 16x1 +#define HAS_SCALEFILTERROWS_NEON +static void ScaleFilterRows_NEON(uint8* dst_ptr, + const uint8* src_ptr, int src_stride, + int dst_width, int source_y_fraction) { + asm volatile ( + "cmp %4, #0 \n" + "beq 2f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 3f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + "1: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! \n" + "subs %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.u8 {q0}, [%0]! \n" + "bhi 1b \n" + "b 4f \n" + + "2: \n" + "vld1.u8 {q0}, [%1]! \n" + "subs %3, #16 \n" + "vst1.u8 {q0}, [%0]! \n" + "bhi 2b \n" + "b 4f \n" + + "3: \n" + "vld1.u8 {q0}, [%1]! \n" + "vld1.u8 {q1}, [%2]! 
\n" + "subs %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.u8 {q0}, [%0]! \n" + "bhi 3b \n" + "4: \n" + "vst1.u8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" + ); +} + /** * SSE2 downscalers with interpolation. * @@ -3471,6 +3526,12 @@ void ScalePlaneBilinear(int src_width, int src_height, int dst_width, int source_y_fraction); void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int dx); +#if defined(HAS_SCALEFILTERROWS_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(src_width, 16)) { + ScaleFilterRows = ScaleFilterRows_NEON; + } else +#endif #if defined(HAS_SCALEFILTERROWS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) && diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 392353c33..af715d5b4 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -153,8 +153,8 @@ TEST_F(libyuvTest, ScaleDownBy2) { const int src_width = 1280; const int src_height = 720; - const int dst_width = src_width >> 1; - const int dst_height = src_height >> 1; + const int dst_width = src_width / 2; + const int dst_height = src_height / 2; int err = 0; for (int f = 0; f < 3; ++f) @@ -169,8 +169,8 @@ TEST_F(libyuvTest, ScaleDownBy4) { const int src_width = 1280; const int src_height = 720; - const int dst_width = src_width >> 2; - const int dst_height = src_height >> 2; + const int dst_width = src_width / 4; + const int dst_height = src_height / 4; int err = 0; for (int f = 0; f < 3; ++f) @@ -185,8 +185,8 @@ TEST_F(libyuvTest, ScaleDownBy34) { const int src_width = 1280; const int src_height = 720; - const int dst_width = (src_width*3) >> 2; - const int dst_height = (src_height*3) >> 2; + const int dst_width = src_width * 3 / 4; + const int dst_height = src_height * 3 / 4; int err = 0; for (int f = 0; f < 3; ++f) @@ -200,8 
+200,24 @@ TEST_F(libyuvTest, ScaleDownBy34) { TEST_F(libyuvTest, ScaleDownBy38) { int src_width = 1280; int src_height = 720; - int dst_width = (src_width*3) >> 3; - int dst_height = (src_height*3) >> 3; + int dst_width = src_width * 3 / 8; + int dst_height = src_height * 3 / 8; + + int err = 0; + + for (int f = 0; f < 3; ++f) + err += TestFilter (src_width, src_height, + dst_width, dst_height, + static_cast<FilterMode>(f)); + + EXPECT_EQ(0, err); +} + +TEST_F(libyuvTest, ScalePlaneBilinear) { + int src_width = 1280; + int src_height = 720; + int dst_width = 1366; + int dst_height = 768; int err = 0;