Frank Barchard 451af5e922 scale by 1 for neon implemented
// Converts a row of unsigned 16-bit integers to IEEE half-precision floats
// with no scaling applied (AArch64 NEON inline assembly).
//
//   src   - input row of 16-bit unsigned values; advanced 16 bytes per pass.
//   dst   - output row of 16-bit half-float bit patterns; advanced 16 bytes
//           per pass.
//   float - unnamed and unused; no multiply is performed in this variant.
//   width - number of elements to convert.
//
// The loop body runs before the counter is tested and handles 8 elements per
// pass, so at least 8 elements are always read and written.
// NOTE(review): presumably callers pass a positive multiple of 8 for width --
// confirm against the call sites.
// MEMACCESS is a project macro defined outside this excerpt.
void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
    "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
    "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts to 32-bit ints
    "scvtf      v2.4s, v2.4s                   \n"  // ints to single-precision floats (low 4)
    "scvtf      v1.4s, v1.4s                   \n"  // ints to single-precision floats (high 4)
    "fcvtn      v4.4h, v2.4s                   \n"  // narrow low 4 floats to half floats
    "fcvtn2     v4.8h, v1.4s                   \n"  // narrow high 4 floats into upper half of v4
   MEMACCESS(1)
    "st1        {v4.16b}, [%1], #16            \n"  // store 8 half floats, dst += 16 bytes
    "b.gt       1b                             \n"  // repeat while remaining width > 0
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  :
  : "cc", "memory", "v1", "v2", "v4"  // flags, memory and the vector registers used above
  );
}

// Converts a row of unsigned 16-bit integers to IEEE half-precision floats,
// multiplying each value by `scale` (AArch64 NEON inline assembly).
//
//   src   - input row of 16-bit unsigned values; advanced 16 bytes per pass.
//   dst   - output row of 16-bit half-float bit patterns; advanced 16 bytes
//           per pass.
//   scale - multiplier applied to every element before conversion.
//   width - number of elements to convert.
//
// Instead of a float-to-half convert instruction, the half-float bits are
// built with integer operations: the value is multiplied by
// scale * 1.9259299444e-34f (about 2^-112, the difference between the
// single-precision exponent bias 127 and the half-precision bias 15), and the
// resulting single-precision bit pattern is shifted right by 13 (the 23 - 10
// mantissa-bit difference) and narrowed to 16 bits with unsigned saturation.
// The shift discards the low mantissa bits without rounding.
//
// The loop body runs before the counter is tested and handles 8 elements per
// pass, so at least 8 elements are always read and written.
// NOTE(review): presumably callers pass a positive multiple of 8 for width --
// confirm against the call sites.
// MEMACCESS is a project macro defined outside this excerpt.
void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts, src += 16 bytes
    "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop; sets flags for b.gt
    "uxtl       v2.4s, v1.4h                   \n"  // zero-extend low 4 shorts to 32-bit ints
    "uxtl2      v1.4s, v1.8h                   \n"  // zero-extend high 4 shorts to 32-bit ints
    "scvtf      v2.4s, v2.4s                   \n"  // ints to single-precision floats (low 4)
    "scvtf      v1.4s, v1.4s                   \n"  // ints to single-precision floats (high 4)
    "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // apply scale and adjust exponent
    "fmul       v1.4s, v1.4s, %3.s[0]          \n"
    "uqshrn     v4.4h, v2.4s, #13              \n"  // isolate halffloat bits (low 4)
    "uqshrn2    v4.8h, v1.4s, #13              \n"  // isolate halffloat bits (high 4)
   MEMACCESS(1)
    "st1        {v4.16b}, [%1], #16            \n"  // store 8 half floats, dst += 16 bytes
    "b.gt       1b                             \n"  // repeat while remaining width > 0
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "w"(scale * 1.9259299444e-34f)    // %3
  : "cc", "memory", "v1", "v2", "v4"  // flags, memory and the vector registers used above
  );
}

TEST=LibYUVPlanarTest.TestHalfFloatPlane_One
BUG=libyuv:560
R=hubbe@chromium.org

Review URL: https://codereview.chromium.org/2430313008 .
2016-10-21 14:30:03 -07:00
..
compare_common.cc xmmword cast for clang 2015-08-18 11:13:12 -07:00
compare_gcc.cc nolint removed 2015-08-31 10:52:13 -07:00
compare_neon64.cc xmmword cast for clang 2015-08-18 11:13:12 -07:00
compare_neon.cc xmmword cast for clang 2015-08-18 11:13:12 -07:00
compare_win.cc xmmword cast for clang 2015-08-18 11:13:12 -07:00
compare.cc xmmword cast for clang 2015-08-18 11:13:12 -07:00
convert_argb.cc Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions 2016-10-19 11:10:51 -07:00
convert_from_argb.cc Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions 2016-10-19 11:10:51 -07:00
convert_from.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
convert_jpeg.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
convert_to_argb.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
convert_to_i420.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
convert.cc Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions 2016-10-19 11:10:51 -07:00
cpu_id.cc cpu_id cleanup. no functional change. 2016-10-18 12:26:02 -07:00
mjpeg_decoder.cc Suppress MJPEG fprintf() runtime warning 2016-08-22 16:30:36 -07:00
mjpeg_validate.cc validate scan EOI from end for better coverage 2015-09-14 10:58:51 -07:00
planar_functions.cc scale by 1 for neon implemented 2016-10-21 14:30:03 -07:00
rotate_any.cc rename MIPS_DSPR2 to DSPR2 for consistency 2016-02-05 14:49:54 -08:00
rotate_argb.cc Add MIPS SIMD Arch (MSA) optimized ARGBMirrorRow function 2016-09-26 16:28:01 -07:00
rotate_common.cc rotate include and proto cleanup 2015-07-22 18:09:04 -07:00
rotate_gcc.cc use visual c 32 bit code for clangcl 2015-08-11 10:10:45 -07:00
rotate_mips.cc white space fixes for MIPS 2016-05-24 14:17:18 -07:00
rotate_neon64.cc Remove initialize to zero on output variables for inline. 2016-04-18 16:24:26 -07:00
rotate_neon.cc Remove initialize to zero on output variables for inline. 2016-04-18 16:24:26 -07:00
rotate_win.cc use visual c 32 bit code for clangcl 2015-08-11 10:10:45 -07:00
rotate.cc Add MIPS SIMD Arch (MSA) optimized MirrorRow function 2016-09-22 16:12:22 -07:00
row_any.cc scale by 1 for neon implemented 2016-10-21 14:30:03 -07:00
row_common.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
row_gcc.cc scale by 1 for neon implemented 2016-10-21 14:30:03 -07:00
row_mips.cc white space fixes for MIPS 2016-05-24 14:17:18 -07:00
row_msa.cc Add MSA optimized ARGB4444ToI420 and ARGB4444ToARGB functions 2016-10-19 11:10:51 -07:00
row_neon64.cc scale by 1 for neon implemented 2016-10-21 14:30:03 -07:00
row_neon.cc Remove I411 support. 2016-10-11 11:14:16 -07:00
row_win.cc Port HalfFloatRow_SSE2 to AVX2 but not using F16C. 2016-10-14 19:01:41 -07:00
scale_any.cc Odd width variation of scale down by 2 for subsampling 2016-01-06 15:12:17 -08:00
scale_argb.cc rename MIPS_DSPR2 to DSPR2 for consistency 2016-02-05 14:49:54 -08:00
scale_common.cc Fix some comment typos 2016-09-15 15:38:19 -07:00
scale_gcc.cc Fix some comment typos 2016-09-15 15:38:19 -07:00
scale_mips.cc rename MIPS_DSPR2 to DSPR2 for consistency 2016-02-05 14:49:54 -08:00
scale_neon64.cc fix multi-line comment warning 2016-09-16 15:16:39 -07:00
scale_neon.cc fix multi-line comment warning 2016-09-16 15:16:39 -07:00
scale_win.cc YUV scale filter columns improved filtering accuracy 2016-06-23 20:16:55 -07:00
scale.cc Scale by 3/8 only if source is multiple of 8 tall. 2016-09-16 14:57:47 -07:00
video_common.cc Treat YU12 as an alias for I420. Simplify setting of inv_crop_height. 2016-06-16 12:49:17 +02:00