Neon version of ScaleRowDown34.

Review URL: http://webrtc-codereview.appspot.com/250003 git-svn-id: http://libyuv.googlecode.com/svn/trunk@44 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2011-10-28 17:32:35 +00:00 · 2011-10-28 17:32:35 +00:00 · f626bea05f
commit f626bea05f
parent 82ca395828
2 changed files with 257 additions and 0 deletions
--- a/source/scale.cc
+++ b/source/scale.cc
@ -168,6 +168,124 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
  );
 }

+#define HAS_SCALEROWDOWN34_NEON
+// Down scale a row from 4 to 3 pixels by point sampling.  The NEON multilane
+//  load (vld4) splits every 4th pixel into its own register; lanes 0, 1 and 3
+// are kept, so each pass turns 32 source pixels into 24 output pixels.
+static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
+                                uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "1:                                   \n" // loop head, one pass per 24 output pixels
+    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // src line 0: 32 bytes, de-interleaved by 4
+    "vmov         d2, d3                  \n" // order needs to be d0, d1, d2 (drops lane 2)
+    "vst3.u8      {d0, d1, d2}, [%1]!     \n" // re-interleave and store 24 bytes
+    "subs         %2, #24                 \n" // count down the remaining output pixels
+    "bhi          1b                      \n" // repeat until the count hits 0 or borrows
+    : "+r"(src_ptr),          // %0: source row, post-incremented by the load
+      "+r"(dst_ptr),          // %1: destination row, post-incremented by the store
+      "+r"(dst_width)         // %2: output pixels left, decremented by 24 per pass
+    :                         // no input-only operands
+    : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+
+// Down scale a row from 4 to 3 pixels with filtering, blending two source
+//  rows with a 3:1 weight (row at src_ptr : row at src_ptr + src_stride).
+// Each pass consumes 32 pixels from both rows and writes 24 pixels.
+//   src_ptr    - first (more heavily weighted) source row.
+//   src_stride - byte offset from src_ptr to the second source row.
+//   dst_ptr    - destination row.
+//   dst_width  - output pixels to produce; 24 are retired per pass.
+static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vmov.u8      d16, #3                 \n" // d16 = constant weight 3
+    "add          %3, %0                  \n" // %3 now points at src line 1
+    "1:                                   \n"
+    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // src line 0
+    "vld4.u8      {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q4, d4                  \n"
+    "vmovl.u8     q5, d5                  \n"
+    "vmovl.u8     q6, d6                  \n"
+    "vmovl.u8     q7, d7                  \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q4, d0, d16             \n"
+    "vmlal.u8     q5, d1, d16             \n"
+    "vmlal.u8     q6, d2, d16             \n"
+    "vmlal.u8     q7, d3, d16             \n"
+
+    // (3 * line_0 + line_1) >> 2, rounded and narrowed back to bytes
+    "vqrshrn.u16  d0, q4, #2              \n"
+    "vqrshrn.u16  d1, q5, #2              \n"
+    "vqrshrn.u16  d2, q6, #2              \n"
+    "vqrshrn.u16  d3, q7, #2              \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q4, d1                  \n"
+    "vmlal.u8     q4, d0, d16             \n"
+    "vqrshrn.u16  d0, q4, #2              \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2              \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q4, d2                  \n"
+    "vmlal.u8     q4, d3, d16             \n"
+    "vqrshrn.u16  d2, q4, #2              \n"
+
+    "vst3.u8      {d0, d1, d2}, [%1]!     \n"
+
+    "subs         %2, #24                 \n"
+    "bhi          1b                      \n"
+    : "+r"(src_ptr),          // %0
+      "+r"(dst_ptr),          // %1
+      "+r"(dst_width),        // %2
+      "+r"(src_stride)        // %3
+    :
+    // d16 holds the weight constant and is written by the asm, so it must be
+    // declared here; the previous list named d17, which is never touched.
+    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "d16", "memory", "cc"
+  );
+}
+
+// Down scale a row from 4 to 3 pixels with filtering, blending two source
+//  rows equally (rounded average) before the horizontal 4 to 3 filter.
+// Each pass consumes 32 pixels from both rows and writes 24 pixels.
+//   src_ptr    - first source row.
+//   src_stride - byte offset from src_ptr to the second source row.
+//   dst_ptr    - destination row.
+//   dst_width  - output pixels to produce; 24 are retired per pass.
+static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm__ volatile
+  (
+    "vmov.u8      d10, #3                 \n" // d10 = constant weight 3
+    "add          %3, %0                  \n" // %3 now points at src line 1
+    "1:                                   \n"
+    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // src line 0
+    "vld4.u8      {d4, d5, d6, d7}, [%3]! \n" // src line 1
+
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2              \n"
+    "vrhadd.u8    q1, q1, q3              \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                  \n"
+    "vmlal.u8     q3, d0, d10             \n"
+    "vqrshrn.u16  d0, q3, #2              \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2              \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                  \n"
+    "vmlal.u8     q3, d3, d10             \n"
+    "vqrshrn.u16  d2, q3, #2              \n"
+
+    "vst3.u8      {d0, d1, d2}, [%1]!     \n"
+
+    "subs         %2, #24                 \n"
+    "bhi          1b                      \n"
+    : "+r"(src_ptr),          // %0
+      "+r"(dst_ptr),          // %1
+      "+r"(dst_width),        // %2
+      "+r"(src_stride)        // %3
+    :
+    // Only registers the asm actually writes are listed; r4 is never used
+    // here, so it is no longer declared as clobbered.
+    : "q0", "q1", "q2", "q3", "d10", "memory", "cc"
+  );
+}
+
 /**
 * SSE2 downscalers with interpolation.
 *
@ -2857,6 +2975,18 @@ static void ScalePlaneDown34(int src_width, int src_height,
                           uint8* dst_ptr, int dst_width);
  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
+      (dst_width % 24 == 0) && (dst_stride % 8 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
+    }
+  } else
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */

+#include "libyuv/cpu_id.h"
 #include "libyuv/scale.h"
 #include "unit_test.h"
 #include <stdlib.h>
@ -157,3 +158,129 @@ TEST_F(libyuvTest, ScaleDownBy4) {

  EXPECT_EQ(0, err);
 }
+
+TEST_F(libyuvTest, ScaleDownBy34) {
+  // Every plane is surrounded by a border of `pad` pixels on all four sides.
+  const int pad = 128;
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int src_width_uv = (src_width + 1) >> 1;
+  const int src_height_uv = (src_height + 1) >> 1;
+
+  const int src_stride_y = src_width + 2 * pad;
+  const int src_stride_uv = src_width_uv + 2 * pad;
+
+  const int src_y_plane_size = src_stride_y * (src_height + 2 * pad);
+  const int src_uv_plane_size = src_stride_uv * (src_height_uv + 2 * pad);
+
+  align_buffer_16(src_y, src_y_plane_size)
+  align_buffer_16(src_u, src_uv_plane_size)
+  align_buffer_16(src_v, src_uv_plane_size)
+
+  // Destination is 3/4 of the source in both dimensions.
+  const int dst_width = (src_width * 3) >> 2;
+  const int dst_height = (src_height * 3) >> 2;
+  const int dst_width_uv = (dst_width + 1) >> 1;
+  const int dst_height_uv = (dst_height + 1) >> 1;
+
+  const int dst_stride_y = dst_width + 2 * pad;
+  const int dst_stride_uv = dst_width_uv + 2 * pad;
+
+  const int dst_y_plane_size = dst_stride_y * (dst_height + 2 * pad);
+  const int dst_uv_plane_size = dst_stride_uv * (dst_height_uv + 2 * pad);
+
+  // Index of the first pixel inside the border for each kind of plane.
+  const int src_y_start = src_stride_y * pad + pad;
+  const int src_uv_start = src_stride_uv * pad + pad;
+  const int dst_y_start = dst_stride_y * pad + pad;
+  const int dst_uv_start = dst_stride_uv * pad + pad;
+
+  srandom(time(NULL));
+
+  // Fill the interior of the luma plane with random bytes.
+  for (int row = 0; row < src_height; ++row) {
+    const int line = src_y_start + row * src_stride_y;
+    for (int col = 0; col < src_width; ++col) {
+      src_y[line + col] = (random() & 0xff);
+    }
+  }
+
+  // Fill the interior of both chroma planes, U before V for each pixel.
+  for (int row = 0; row < src_height_uv; ++row) {
+    const int line = src_uv_start + row * src_stride_uv;
+    for (int col = 0; col < src_width_uv; ++col) {
+      src_u[line + col] = (random() & 0xff);
+      src_v[line + col] = (random() & 0xff);
+    }
+  }
+
+  int err = 0;
+
+  // currently three filter modes, defined as FilterMode in scale.h
+  for (int f = 0; f < 3; ++f) {
+    align_buffer_16(dst_y_c, dst_y_plane_size)
+    align_buffer_16(dst_u_c, dst_uv_plane_size)
+    align_buffer_16(dst_v_c, dst_uv_plane_size)
+    align_buffer_16(dst_y_opt, dst_y_plane_size)
+    align_buffer_16(dst_u_opt, dst_uv_plane_size)
+    align_buffer_16(dst_v_opt, dst_uv_plane_size)
+
+    // First pass with the CPU flags masked to 0.
+    libyuv::MaskCpuFlagsForTest(0);
+    I420Scale(src_y + src_y_start, src_stride_y,
+              src_u + src_uv_start, src_stride_uv,
+              src_v + src_uv_start, src_stride_uv,
+              src_width, src_height,
+              dst_y_c + dst_y_start, dst_stride_y,
+              dst_u_c + dst_uv_start, dst_stride_uv,
+              dst_v_c + dst_uv_start, dst_stride_uv,
+              dst_width, dst_height,
+              static_cast<FilterMode>(f));
+
+    // Second pass with the CPU flags masked to -1.
+    libyuv::MaskCpuFlagsForTest(-1);
+    I420Scale(src_y + src_y_start, src_stride_y,
+              src_u + src_uv_start, src_stride_uv,
+              src_v + src_uv_start, src_stride_uv,
+              src_width, src_height,
+              dst_y_opt + dst_y_start, dst_stride_y,
+              dst_u_opt + dst_uv_start, dst_stride_uv,
+              dst_v_opt + dst_uv_start, dst_stride_uv,
+              dst_width, dst_height,
+              static_cast<FilterMode>(f));
+
+    // The two results are allowed to disagree slightly because a different
+    //  order of operations can round differently.  Track the largest
+    //  per-pixel difference over all three planes; more than 2 is an error.
+    int max_diff = 0;
+
+    for (int row = 0; row < dst_height; ++row) {
+      const int line = dst_y_start + row * dst_stride_y;
+      for (int col = 0; col < dst_width; ++col) {
+        const int diff_y = abs(dst_y_c[line + col] - dst_y_opt[line + col]);
+        if (max_diff < diff_y)
+          max_diff = diff_y;
+      }
+    }
+
+    for (int row = 0; row < dst_height_uv; ++row) {
+      const int line = dst_uv_start + row * dst_stride_uv;
+      for (int col = 0; col < dst_width_uv; ++col) {
+        const int diff_u = abs(dst_u_c[line + col] - dst_u_opt[line + col]);
+        if (max_diff < diff_u)
+          max_diff = diff_u;
+        const int diff_v = abs(dst_v_c[line + col] - dst_v_opt[line + col]);
+        if (max_diff < diff_v)
+          max_diff = diff_v;
+      }
+    }
+
+    if (max_diff > 2)
+      ++err;
+
+    free_aligned_buffer_16(dst_y_c)
+    free_aligned_buffer_16(dst_u_c)
+    free_aligned_buffer_16(dst_v_c)
+    free_aligned_buffer_16(dst_y_opt)
+    free_aligned_buffer_16(dst_u_opt)
+    free_aligned_buffer_16(dst_v_opt)
+  }
+
+  free_aligned_buffer_16(src_y)
+  free_aligned_buffer_16(src_u)
+  free_aligned_buffer_16(src_v)
+
+  EXPECT_EQ(0, err);
+}