Neon version of ScaleRowDown34.

Review URL: http://webrtc-codereview.appspot.com/250003 git-svn-id: http://libyuv.googlecode.com/svn/trunk@44 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2011-10-28 17:32:35 +00:00 · 2011-10-28 17:32:35 +00:00 · f626bea05f
commit f626bea05f
parent 82ca395828
2 changed files with 257 additions and 0 deletions
--- a/source/scale.cc
+++ b/source/scale.cc
@ -168,6 +168,124 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
  );
 }
 #define HAS_SCALEROWDOWN34_NEON
 // Down scale from 4 to 3 pixels by point sampling (no filtering).  Uses the
 //  NEON multi-lane load to de-interleave every 4th pixel into 4 different
 //  registers, then stores 3 of those registers re-interleaved.
 // Each loop pass point samples 32 source pixels down to 24 pixels: of every
 //  4 source pixels, pixels 0, 1 and 3 are kept and pixel 2 is dropped.
 // src_ptr:   source row; 32 bytes are read per pass.
 // dst_ptr:   destination row; 24 bytes are written per pass.
 // dst_width: destination pixel count.  The loop body runs before the count
 //  is tested and the count drops by 24 per pass, so callers are expected to
 //  pass a positive multiple of 24; any other value makes the last pass
 //  write beyond dst_width.
 // The stride parameter is unnamed and unused; only one source row is read.
 static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */,
                                 uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "1:                                   \n"
    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // 32 pixels; dN = pixel N of each 4
    "vmov         d2, d3                  \n" // drop pixel 2: vst3 needs d0, d1, d2
    "vst3.u8      {d0, d1, d2}, [%1]!     \n" // 24 pixels out, re-interleaved
    "subs         %2, #24                 \n" // 24 fewer pixels left to produce
    "bhi          1b                      \n" // loop while the remainder is > 0
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "d0", "d1", "d2", "d3", "memory", "cc"
  );
 }
 // Down scale from 4 to 3 pixels with filtering, blending two source rows
 //  with a 3:1 weight in favour of the row at src_ptr.
 // Each loop pass turns 32 pixels from each of the two rows into 24 pixels:
 //  1. vertical:   s[n] = (3 * row0[n] + row1[n] + 2) >> 2
 //  2. horizontal, for every group of 4 filtered pixels s[0..3]:
 //       a0 = (3 * s[0] + s[1] + 2) >> 2
 //       a1 = (s[1] + s[2] + 1) >> 1
 //       a2 = (s[2] + 3 * s[3] + 2) >> 2
 // src_ptr:    first source row (weight 3).
 // src_stride: byte offset from src_ptr to the second source row (weight 1).
 // dst_ptr:    destination row; 24 bytes are written per pass.
 // dst_width:  destination pixel count.  The loop body runs before the count
 //  is tested and the count drops by 24 per pass, so callers are expected to
 //  pass a positive multiple of 24.
 // NOTE(review): src_stride is an int that is turned into the second row's
 //  address in place; this assumes an address fits in an int (32-bit ARM) -
 //  confirm before building for other targets.
 static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "vmov.u8      d16, #3                 \n" // d16 = weight 3 in every lane
    "add          %3, %0                  \n" // %3 = address of src line 1
    "1:                                   \n"
    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "vld4.u8      {d4, d5, d6, d7}, [%3]! \n" // src line 1
    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8     q4, d4                  \n"
    "vmovl.u8     q5, d5                  \n"
    "vmovl.u8     q6, d6                  \n"
    "vmovl.u8     q7, d7                  \n"
    // 3 * line_0 + line_1
    "vmlal.u8     q4, d0, d16             \n"
    "vmlal.u8     q5, d1, d16             \n"
    "vmlal.u8     q6, d2, d16             \n"
    "vmlal.u8     q7, d3, d16             \n"
    // (3 * line_0 + line_1) >> 2, rounded, narrowed back to bytes
    "vqrshrn.u16  d0, q4, #2              \n"
    "vqrshrn.u16  d1, q5, #2              \n"
    "vqrshrn.u16  d2, q6, #2              \n"
    "vqrshrn.u16  d3, q7, #2              \n"
    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q4, d1                  \n"
    "vmlal.u8     q4, d0, d16             \n"
    "vqrshrn.u16  d0, q4, #2              \n"
    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2              \n"
    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q4, d2                  \n"
    "vmlal.u8     q4, d3, d16             \n"
    "vqrshrn.u16  d2, q4, #2              \n"
    "vst3.u8      {d0, d1, d2}, [%1]!     \n"
    "subs         %2, #24                 \n"
    "bhi          1b                      \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    :
    // d16 holds the weight constant and is overwritten above, so it must be
    // listed here; otherwise the compiler may keep a live value in it.
    : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "d16", "memory", "cc"
  );
 }
 // Down scale from 4 to 3 pixels with filtering, blending two source rows
 //  with equal (1:1) weight.
 // Each loop pass turns 32 pixels from each of the two rows into 24 pixels:
 //  1. vertical:   s[n] = (row0[n] + row1[n] + 1) >> 1
 //  2. horizontal, for every group of 4 averaged pixels s[0..3]:
 //       a0 = (3 * s[0] + s[1] + 2) >> 2
 //       a1 = (s[1] + s[2] + 1) >> 1
 //       a2 = (s[2] + 3 * s[3] + 2) >> 2
 // src_ptr:    first source row.
 // src_stride: byte offset from src_ptr to the second source row.
 // dst_ptr:    destination row; 24 bytes are written per pass.
 // dst_width:  destination pixel count.  The loop body runs before the count
 //  is tested and the count drops by 24 per pass, so callers are expected to
 //  pass a positive multiple of 24.
 // NOTE(review): src_stride is an int that is turned into the second row's
 //  address in place; this assumes an address fits in an int (32-bit ARM) -
 //  confirm before building for other targets.
 static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "vmov.u8      d10, #3                 \n" // d10 = weight 3 in every lane
    "add          %3, %0                  \n" // %3 = address of src line 1
    "1:                                   \n"
    "vld4.u8      {d0, d1, d2, d3}, [%0]! \n" // src line 0
    "vld4.u8      {d4, d5, d6, d7}, [%3]! \n" // src line 1
    // average src line 0 with src line 1
    "vrhadd.u8    q0, q0, q2              \n"
    "vrhadd.u8    q1, q1, q3              \n"
    // line 1 has been consumed, so q3 is reused as the 16 bit scratch below
    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q3, d1                  \n"
    "vmlal.u8     q3, d0, d10             \n"
    "vqrshrn.u16  d0, q3, #2              \n"
    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2              \n"
    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q3, d2                  \n"
    "vmlal.u8     q3, d3, d10             \n"
    "vqrshrn.u16  d2, q3, #2              \n"
    "vst3.u8      {d0, d1, d2}, [%1]!     \n"
    "subs         %2, #24                 \n"
    "bhi          1b                      \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width),        // %2
      "+r"(src_stride)        // %3
    :
    // Only registers the template actually writes are listed; no core
    // register other than the four operands is touched.
    : "q0", "q1", "q2", "q3", "d10", "memory", "cc"
  );
 }
 /**
 * SSE2 downscalers with interpolation.
 *
@ -2857,6 +2975,18 @@ static void ScalePlaneDown34(int src_width, int src_height,
                           uint8* dst_ptr, int dst_width);
  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
 #if defined(HAS_SCALEROWDOWN34_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (dst_width % 24 == 0) && (dst_stride % 8 == 0)) {
    if (!filtering) {
      ScaleRowDown34_0 = ScaleRowDown34_NEON;
      ScaleRowDown34_1 = ScaleRowDown34_NEON;
    } else {
      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
    }
  } else
 #endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@ -8,6 +8,7 @@
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "libyuv/cpu_id.h"
 #include "libyuv/scale.h"
 #include "unit_test.h"
 #include <stdlib.h>
@ -157,3 +158,129 @@ TEST_F(libyuvTest, ScaleDownBy4) {
  EXPECT_EQ(0, err);
 }
 TEST_F(libyuvTest, ScaleDownBy34) {
  // Scales a random 1280x720 I420 frame to 3/4 size twice per filter mode,
  //  once with all CPU flags masked off and once with them enabled, and
  //  checks that the two results never differ by more than 2.
  // Every plane carries a border of |pad| pixels on all four sides.
  const int pad = 128;

  const int src_width = 1280;
  const int src_height = 720;
  const int src_width_uv = (src_width + 1) >> 1;
  const int src_height_uv = (src_height + 1) >> 1;
  const int src_stride_y = 2 * pad + src_width;
  const int src_stride_uv = 2 * pad + src_width_uv;
  const int src_y_plane_size = (src_width + (2 * pad)) *
                               (src_height + (2 * pad));
  const int src_uv_plane_size = (src_width_uv + (2 * pad)) *
                                (src_height_uv + (2 * pad));

  const int dst_width = (src_width * 3) >> 2;
  const int dst_height = (src_height * 3) >> 2;
  const int dst_width_uv = (dst_width + 1) >> 1;
  const int dst_height_uv = (dst_height + 1) >> 1;
  const int dst_stride_y = 2 * pad + dst_width;
  const int dst_stride_uv = 2 * pad + dst_width_uv;
  const int dst_y_plane_size = (dst_width + (2 * pad)) *
                               (dst_height + (2 * pad));
  const int dst_uv_plane_size = (dst_width_uv + (2 * pad)) *
                                (dst_height_uv + (2 * pad));

  // Offset of the first pixel inside the border, per plane kind.
  const int src_origin_y = (src_stride_y * pad) + pad;
  const int src_origin_uv = (src_stride_uv * pad) + pad;
  const int dst_origin_y = (dst_stride_y * pad) + pad;
  const int dst_origin_uv = (dst_stride_uv * pad) + pad;

  align_buffer_16(src_y, src_y_plane_size)
  align_buffer_16(src_u, src_uv_plane_size)
  align_buffer_16(src_v, src_uv_plane_size)

  // Fill the area inside the border of each source plane with random bytes.
  srandom(time(NULL));
  for (int row = pad; row < (src_height + pad); ++row) {
    const int row_start = row * src_stride_y;
    for (int col = pad; col < (src_width + pad); ++col) {
      src_y[row_start + col] = (random() & 0xff);
    }
  }
  for (int row = pad; row < (src_height_uv + pad); ++row) {
    const int row_start = row * src_stride_uv;
    for (int col = pad; col < (src_width_uv + pad); ++col) {
      src_u[row_start + col] = (random() & 0xff);
      src_v[row_start + col] = (random() & 0xff);
    }
  }

  int failures = 0;
  // currently three filter modes, defined as FilterMode in scale.h
  for (int mode = 0; mode < 3; ++mode) {
    const FilterMode filter = static_cast<FilterMode>(mode);
    int worst = 0;
    align_buffer_16(dst_y_c, dst_y_plane_size)
    align_buffer_16(dst_u_c, dst_uv_plane_size)
    align_buffer_16(dst_v_c, dst_uv_plane_size)
    align_buffer_16(dst_y_opt, dst_y_plane_size)
    align_buffer_16(dst_u_opt, dst_uv_plane_size)
    align_buffer_16(dst_v_opt, dst_uv_plane_size)

    // Reference pass: every CPU flag masked off.
    libyuv::MaskCpuFlagsForTest(0);
    I420Scale(src_y + src_origin_y, src_stride_y,
              src_u + src_origin_uv, src_stride_uv,
              src_v + src_origin_uv, src_stride_uv,
              src_width, src_height,
              dst_y_c + dst_origin_y, dst_stride_y,
              dst_u_c + dst_origin_uv, dst_stride_uv,
              dst_v_c + dst_origin_uv, dst_stride_uv,
              dst_width, dst_height,
              filter);

    // Optimized pass: every CPU flag enabled.
    libyuv::MaskCpuFlagsForTest(-1);
    I420Scale(src_y + src_origin_y, src_stride_y,
              src_u + src_origin_uv, src_stride_uv,
              src_v + src_origin_uv, src_stride_uv,
              src_width, src_height,
              dst_y_opt + dst_origin_y, dst_stride_y,
              dst_u_opt + dst_origin_uv, dst_stride_uv,
              dst_v_opt + dst_origin_uv, dst_stride_uv,
              dst_width, dst_height,
              filter);

    // The two passes may round in a different order, so the outputs are not
    //  required to match exactly.  Track the largest per-pixel difference
    //  over all three planes; anything above 2 counts as a failure.
    for (int row = pad; row < (dst_height + pad); ++row) {
      for (int col = pad; col < (dst_width + pad); ++col) {
        const int at = (row * dst_stride_y) + col;
        const int delta = abs(dst_y_c[at] - dst_y_opt[at]);
        if (delta > worst) {
          worst = delta;
        }
      }
    }
    for (int row = pad; row < (dst_height_uv + pad); ++row) {
      for (int col = pad; col < (dst_width_uv + pad); ++col) {
        const int at = (row * dst_stride_uv) + col;
        const int delta_u = abs(dst_u_c[at] - dst_u_opt[at]);
        if (delta_u > worst) {
          worst = delta_u;
        }
        const int delta_v = abs(dst_v_c[at] - dst_v_opt[at]);
        if (delta_v > worst) {
          worst = delta_v;
        }
      }
    }
    if (worst > 2) {
      ++failures;
    }

    free_aligned_buffer_16(dst_y_c)
    free_aligned_buffer_16(dst_u_c)
    free_aligned_buffer_16(dst_v_c)
    free_aligned_buffer_16(dst_y_opt)
    free_aligned_buffer_16(dst_u_opt)
    free_aligned_buffer_16(dst_v_opt)
  }

  free_aligned_buffer_16(src_y)
  free_aligned_buffer_16(src_u)
  free_aligned_buffer_16(src_v)

  EXPECT_EQ(0, failures);
 }