rounding for arm filter

R=wangcheng@google.com, harryjin@google.com BUG=libyuv:607 Review URL: https://codereview.chromium.org/2093913004 .
2025-12-07 01:06:46 +08:00 · 2016-06-24 16:07:49 -07:00 · 2016-06-24 16:07:49 -07:00 · b8ddb5a2a7
commit b8ddb5a2a7
parent 1b3e4aee47
5 changed files with 10 additions and 12 deletions
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1600
+Version: 1601
 License: BSD
 License File: LICENSE
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1600
+#define LIBYUV_VERSION 1601
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@@ -417,11 +417,9 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
 }
 // (1-f)a + fb can be replaced with a + f(b-a)
-#if defined(__arm__)
+#if defined(__arm__) || defined(__aarch64__)
 // arm uses 16 bit math with truncation.
 // TODO(fbarchard): add rounding.
 #define BLENDER(a, b, f) (uint8)((int)(a) + \
-    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 #else
 // intel uses 7 bit math with rounding.
 #define BLENDER(a, b, f) (uint8)((int)(a) + \
@@ -480,7 +478,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
 // Same as 8 bit arm blender but return is cast to uint16
 #define BLENDER(a, b, f) (uint16)((int)(a) + \
-    (((int)((f)) * ((int)(b) - (int)(a))) >> 16))
+    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
                       int dst_width, int x, int dx) {
@@ -818,7 +816,7 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
  }
 }
-// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=605.
+// TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
 #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
 #define BLENDERC(a, b, f, s) (uint32)( \
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -612,8 +612,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
    "vmovl.u16  q10, d21                       \n"
    "vmul.s32   q11, q11, q13                  \n"
    "vmul.s32   q12, q12, q10                  \n"
-    "vshrn.s32  d18, q11, #16                  \n"
+    "vrshrn.s32  d18, q11, #16                 \n"
-    "vshrn.s32  d19, q12, #16                  \n"
+    "vrshrn.s32  d19, q12, #16                 \n"
    "vadd.s16   q8, q8, q9                     \n"
    "vmovn.s16  d6, q8                         \n"
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -626,8 +626,8 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
    "ushll2    v6.4s, v6.8h, #0                \n"
    "mul       v16.4s, v16.4s, v7.4s           \n"
    "mul       v17.4s, v17.4s, v6.4s           \n"
-    "shrn      v6.4h, v16.4s, #16              \n"
+    "rshrn      v6.4h, v16.4s, #16             \n"
-    "shrn2     v6.8h, v17.4s, #16              \n"
+    "rshrn2     v6.8h, v17.4s, #16             \n"
    "add       v4.8h, v4.8h, v6.8h             \n"
    "xtn       v4.8b, v4.8h                    \n"