diff --git a/source/scale.cc b/source/scale.cc
index d3b7d3332..6be7f4ed8 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -168,6 +168,124 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
   );
 }
 
+#define HAS_SCALEROWDOWN34_NEON
+// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
+// to de-interleave every 4th pixel into a different register.
+// Point samples 32 pixels to 24 pixels.
+static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
+                                uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "1:                                    \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!    \n"  // src line 0
+    "vmov       d2, d3                     \n"  // order needs to be d0, d1, d2
+    "vst3.u8    {d0, d1, d2}, [%1]!        \n"
+    "subs       %2, #24                    \n"
+    "bhi        1b                         \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    :
+    : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+
+static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vmov.u8    d16, #3                    \n"
+    "add        %3, %0                     \n"
+    "1:                                    \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!    \n"  // src line 0
+    "vld4.u8    {d4, d5, d6, d7}, [%3]!    \n"  // src line 1
+
+    // Filter src line 0 with src line 1.
+    // Widen chars to shorts to allow room
+    // when adding lines together.
+    "vmovl.u8   q4, d4                     \n"
+    "vmovl.u8   q5, d5                     \n"
+    "vmovl.u8   q6, d6                     \n"
+    "vmovl.u8   q7, d7                     \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8   q4, d0, d16                \n"
+    "vmlal.u8   q5, d1, d16                \n"
+    "vmlal.u8   q6, d2, d16                \n"
+    "vmlal.u8   q7, d3, d16                \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "vqrshrn.u16 d0, q4, #2                \n"
+    "vqrshrn.u16 d1, q5, #2                \n"
+    "vqrshrn.u16 d2, q6, #2                \n"
+    "vqrshrn.u16 d3, q7, #2                \n"
+
+    // a0 = (src[0] * 3 + src[1] * 1) >> 2
+    "vmovl.u8   q4, d1                     \n"
+    "vmlal.u8   q4, d0, d16                \n"
+    "vqrshrn.u16 d0, q4, #2                \n"
+
+    // a1 = (src[1] * 1 + src[2] * 1) >> 1
+    "vrhadd.u8  d1, d1, d2                 \n"
+
+    // a2 = (src[2] * 1 + src[3] * 3) >> 2
+    "vmovl.u8   q4, d2                     \n"
+    "vmlal.u8   q4, d3, d16                \n"
+    "vqrshrn.u16 d2, q4, #2                \n"
+
+    "vst3.u8    {d0, d1, d2}, [%1]!        \n"
+
+    "subs       %2, #24                    \n"
+    "bhi        1b                         \n"
+    : "+r"(src_ptr),    // %0
+      "+r"(dst_ptr),    // %1
+      "+r"(dst_width),  // %2
+      "+r"(src_stride)  // %3
+    :
+    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "d16", "memory", "cc"
+  );
+}
+
+static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vmov.u8    d10, #3                    \n"
+    "add        %3, %0                     \n"
+    "1:                                    \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!    \n"  // src line 0
+    "vld4.u8    {d4, d5, d6, d7}, [%3]!    \n"  // src line 1
+
+    // Average src line 0 with src line 1.
+    "vrhadd.u8  q0, q0, q2                 \n"
+    "vrhadd.u8  q1, q1, q3                 \n"
+
+    // a0 = (src[0] * 3 + src[1] * 1) >> 2
+    "vmovl.u8   q3, d1                     \n"
+    "vmlal.u8   q3, d0, d10                \n"
+    "vqrshrn.u16 d0, q3, #2                \n"
+
+    // a1 = (src[1] * 1 + src[2] * 1) >> 1
+    "vrhadd.u8  d1, d1, d2                 \n"
+
+    // a2 = (src[2] * 1 + src[3] * 3) >> 2
+    "vmovl.u8   q3, d2                     \n"
+    "vmlal.u8   q3, d3, d10                \n"
+    "vqrshrn.u16 d2, q3, #2                \n"
+
+    "vst3.u8    {d0, d1, d2}, [%1]!        \n"
+
+    "subs       %2, #24                    \n"
+    "bhi        1b                         \n"
+    : "+r"(src_ptr),    // %0
+      "+r"(dst_ptr),    // %1
+      "+r"(dst_width),  // %2
+      "+r"(src_stride)  // %3
+    :
+    : "q0", "q1", "q2", "q3", "d10", "memory", "cc"
+  );
+}
+
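Note: to make the arithmetic above easy to verify, here is a scalar C model of the filtered path. Both _Int kernels blend the two source rows first (weighted 3:1 with rounding in the _0 variant via vmlal/vqrshrn, a plain rounding average in the _1 variant via vrhadd), then collapse each group of 4 horizontal pixels into 3 with weights 3:1, 1:1 and 1:3. The helper below is illustrative only; it is not part of this patch and its name is hypothetical.

    #include <stdint.h>

    // Scalar model of ScaleRowDown34_0_Int_NEON: per group of 4 source
    // pixels, vertically blend (3 * row0 + row1 + 2) >> 2 (the rounding
    // matches vqrshrn #2), then emit 3 horizontally filtered pixels.
    static void ScaleRowDown34_0_Int_Ref(const uint8_t* src_ptr, int src_stride,
                                         uint8_t* dst_ptr, int dst_width) {
      const uint8_t* s = src_ptr;               // line 0
      const uint8_t* t = src_ptr + src_stride;  // line 1
      for (int x = 0; x < dst_width; x += 3) {
        int p0 = (3 * s[0] + t[0] + 2) >> 2;
        int p1 = (3 * s[1] + t[1] + 2) >> 2;
        int p2 = (3 * s[2] + t[2] + 2) >> 2;
        int p3 = (3 * s[3] + t[3] + 2) >> 2;
        dst_ptr[0] = (uint8_t)((3 * p0 + p1 + 2) >> 2);  // a0: weights 3:1
        dst_ptr[1] = (uint8_t)((p1 + p2 + 1) >> 1);      // a1: rounding average
        dst_ptr[2] = (uint8_t)((p2 + 3 * p3 + 2) >> 2);  // a2: weights 1:3
        s += 4;
        t += 4;
        dst_ptr += 3;
      }
    }

The _1 variant is identical except that the vertical blend is the simple rounding average (row0 + row1 + 1) >> 1.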
 /**
  * SSE2 downscalers with interpolation.
  *
@@ -2857,6 +2975,18 @@ static void ScalePlaneDown34(int src_width, int src_height,
                              uint8* dst_ptr, int dst_width);
   void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
                            uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 24 == 0) && (dst_stride % 8 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
+    }
+  } else
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
       (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
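Note on the dispatch gate above: each NEON iteration loads 32 source pixels with vld4 (de-interleaving every 4th pixel into d0..d3) and stores 24 destination pixels with vst3, hence the dst_width % 24 == 0 requirement (with dst_stride % 8 == 0 for the stores). The unfiltered kernel keeps pixels 0, 1 and 3 of every group of 4; the vmov d2, d3 just moves the fourth de-interleaved lane into store position. A scalar sketch of that sampling (illustrative only, not part of the patch, hypothetical name):

    #include <stdint.h>

    // Point sample each group of 4 source pixels down to 3, keeping
    // indices 0, 1 and 3 -- the same selection ScaleRowDown34_NEON makes.
    static void ScaleRowDown34_Ref(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr, int dst_width) {
      for (int x = 0; x < dst_width; x += 3) {
        dst_ptr[0] = src_ptr[0];
        dst_ptr[1] = src_ptr[1];
        dst_ptr[2] = src_ptr[3];
        src_ptr += 4;
        dst_ptr += 3;
      }
    }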
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index e147d78b0..1d41668cf 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "libyuv/cpu_id.h"
 #include "libyuv/scale.h"
 #include "unit_test.h"
 #include <stdlib.h>
@@ -157,3 +158,129 @@ TEST_F(libyuvTest, ScaleDownBy4) {
 
   EXPECT_EQ(0, err);
 }
+
+TEST_F(libyuvTest, ScaleDownBy34) {
+  int b = 128;
+  int src_width = 1280;
+  int src_height = 720;
+  int src_width_uv = (src_width + 1) >> 1;
+  int src_height_uv = (src_height + 1) >> 1;
+
+  int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b));
+  int src_uv_plane_size = (src_width_uv + (2 * b)) * (src_height_uv + (2 * b));
+
+  int src_stride_y = 2 * b + src_width;
+  int src_stride_uv = 2 * b + src_width_uv;
+
+  align_buffer_16(src_y, src_y_plane_size)
+  align_buffer_16(src_u, src_uv_plane_size)
+  align_buffer_16(src_v, src_uv_plane_size)
+
+  int dst_width = (src_width * 3) >> 2;
+  int dst_height = (src_height * 3) >> 2;
+
+  int dst_width_uv = (dst_width + 1) >> 1;
+  int dst_height_uv = (dst_height + 1) >> 1;
+
+  int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b));
+  int dst_uv_plane_size = (dst_width_uv + (2 * b)) * (dst_height_uv + (2 * b));
+
+  int dst_stride_y = 2 * b + dst_width;
+  int dst_stride_uv = 2 * b + dst_width_uv;
+
+  srandom(time(NULL));
+
+  int i, j;
+
+  for (i = b; i < (src_height + b); ++i) {
+    for (j = b; j < (src_width + b); ++j) {
+      src_y[(i * src_stride_y) + j] = (random() & 0xff);
+    }
+  }
+
+  for (i = b; i < (src_height_uv + b); ++i) {
+    for (j = b; j < (src_width_uv + b); ++j) {
+      src_u[(i * src_stride_uv) + j] = (random() & 0xff);
+      src_v[(i * src_stride_uv) + j] = (random() & 0xff);
+    }
+  }
+
+  int f;
+  int err = 0;
+
+  // Currently three filter modes, defined as FilterMode in scale.h.
+  for (f = 0; f < 3; ++f) {
+    int max_diff = 0;
+    align_buffer_16(dst_y_c, dst_y_plane_size)
+    align_buffer_16(dst_u_c, dst_uv_plane_size)
+    align_buffer_16(dst_v_c, dst_uv_plane_size)
+    align_buffer_16(dst_y_opt, dst_y_plane_size)
+    align_buffer_16(dst_u_opt, dst_uv_plane_size)
+    align_buffer_16(dst_v_opt, dst_uv_plane_size)
+
+    libyuv::MaskCpuFlagsForTest(0);
+    I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+              src_u + (src_stride_uv * b) + b, src_stride_uv,
+              src_v + (src_stride_uv * b) + b, src_stride_uv,
+              src_width, src_height,
+              dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+              dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_width, dst_height,
+              static_cast<FilterMode>(f));
+
+    libyuv::MaskCpuFlagsForTest(-1);
+    I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+              src_u + (src_stride_uv * b) + b, src_stride_uv,
+              src_v + (src_stride_uv * b) + b, src_stride_uv,
+              src_width, src_height,
+              dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+              dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+              dst_width, dst_height,
+              static_cast<FilterMode>(f));
+
+    // The C version may differ slightly from the optimized version:
+    // the order of operations can introduce rounding differences. So
+    // diff the two buffers and check that the max difference is no
+    // more than 2.
+    for (i = b; i < (dst_height + b); ++i) {
+      for (j = b; j < (dst_width + b); ++j) {
+        int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
+                           dst_y_opt[(i * dst_stride_y) + j]);
+        if (abs_diff > max_diff)
+          max_diff = abs_diff;
+      }
+    }
+
+    for (i = b; i < (dst_height_uv + b); ++i) {
+      for (j = b; j < (dst_width_uv + b); ++j) {
+        int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
+                           dst_u_opt[(i * dst_stride_uv) + j]);
+        if (abs_diff > max_diff)
+          max_diff = abs_diff;
+        abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
+                       dst_v_opt[(i * dst_stride_uv) + j]);
+        if (abs_diff > max_diff)
+          max_diff = abs_diff;
+      }
+    }
+
+    if (max_diff > 2)
+      err++;
+
+    free_aligned_buffer_16(dst_y_c)
+    free_aligned_buffer_16(dst_u_c)
+    free_aligned_buffer_16(dst_v_c)
+    free_aligned_buffer_16(dst_y_opt)
+    free_aligned_buffer_16(dst_u_opt)
+    free_aligned_buffer_16(dst_v_opt)
+  }
+
+  free_aligned_buffer_16(src_y)
+  free_aligned_buffer_16(src_u)
+  free_aligned_buffer_16(src_v)
+
+  EXPECT_EQ(0, err);
+}
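Note on the test: the pattern is to run I420Scale once with MaskCpuFlagsForTest(0) (all SIMD disabled, forcing the C path) and once with MaskCpuFlagsForTest(-1) (all detected flags enabled), then compare the two outputs with a tolerance of 2 to absorb rounding-order differences. The three comparison loops all compute the same max-absolute-difference over a bordered plane; a hypothetical helper (not part of the patch) could fold them into one:

    #include <stdint.h>
    #include <stdlib.h>

    // Max absolute difference between two bordered planes, interior only.
    static int MaxPlaneDiff(const uint8_t* a, const uint8_t* b, int stride,
                            int width, int height, int border) {
      int max_diff = 0;
      for (int i = border; i < height + border; ++i) {
        for (int j = border; j < width + border; ++j) {
          int diff = abs(a[i * stride + j] - b[i * stride + j]);
          if (diff > max_diff)
            max_diff = diff;
        }
      }
      return max_diff;
    }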