Add rounding to InterpolateRow for improved quality and consistency.

Remove inaccurate specializations for 1/4 and 3/4, since they round incorrectly. Specializations for 100% and 50% are kept for performance. Make C and ARM code match SSSE3. Make unit tests expect zero difference. BUG=libyuv:535 R=harryjin@google.com Review URL: https://codereview.chromium.org/1533643005 .
2025-12-06 16:56:55 +08:00 · 2015-12-17 15:24:06 -08:00 · 2015-12-17 15:24:06 -08:00 · f4447745ae
commit f4447745ae
parent 1ccbf8fb7b
12 changed files with 57 additions and 512 deletions
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -175,7 +175,6 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 #define HAS_INTERPOLATEROW_SSE2
 #define HAS_INTERPOLATEROW_SSSE3
 #define HAS_RGBCOLORTABLEROW_X86
 #define HAS_SOBELROW_SSE2
@ -1838,9 +1837,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
                      ptrdiff_t src_stride_ptr,
                      int width, int source_y_fraction);
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride_ptr, int width,
                         int source_y_fraction);
 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride_ptr, int width,
                          int source_y_fraction);
@ -1856,9 +1852,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
 void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
                             ptrdiff_t src_stride_ptr, int width,
                             int source_y_fraction);
 void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                             ptrdiff_t src_stride_ptr, int width,
                             int source_y_fraction);
 void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride_ptr, int width,
                              int source_y_fraction);
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1870,14 +1870,6 @@ int InterpolatePlane(const uint8* src0, int src_stride0,
    height = 1;
    src_stride0 = src_stride1 = dst_stride = 0;
  }
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@ -2467,14 +2459,6 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@ -2571,14 +2555,6 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -593,9 +593,6 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
 #ifdef HAS_INTERPOLATEROW_SSSE3
 ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSE2
 ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
 #endif
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -2211,27 +2211,30 @@ static void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
 void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
                      ptrdiff_t src_stride,
                      int width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
+  int y1_fraction = source_y_fraction >> 1;
-  int y0_fraction = 256 - y1_fraction;
+  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
-  if (source_y_fraction == 0) {
+  if (y1_fraction == 0) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }
-  if (source_y_fraction == 128) {
+  if (y1_fraction == 64) {
    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
    return;
  }
  for (x = 0; x < width - 1; x += 2) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[0] = 
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+		(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7;
    dst_ptr[1] = 
 		(src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 64) >> 7;
    src_ptr += 2;
    src_ptr1 += 2;
    dst_ptr += 2;
  }
  if (width & 1) {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[0] = 
 		(src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7;
  }
 }
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -4794,12 +4794,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
@ -4808,6 +4804,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x400040,%%eax                 \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    // General purpose row blend.
    LABELALIGN
@ -4819,6 +4818,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
@ -4828,19 +4829,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
@ -4853,19 +4841,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
@ -4881,8 +4856,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", NACL_R14
+  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
@ -4897,12 +4872,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "sub       %1,%0                           \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
@ -4912,6 +4883,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermd     %%ymm5,%%ymm0,%%ymm5           \n"
    "mov        $0x400040,%%eax                \n"
    "vmovd      %%eax,%%xmm4                   \n"
    "vbroadcastss %%xmm4,%%ymm4                \n"
    // General purpose row blend.
    LABELALIGN
@ -4922,6 +4896,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm5,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm5,%%ymm1,%%ymm1           \n"
    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
@ -4931,19 +4907,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm1)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
@ -4955,19 +4918,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm1        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm0)
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    "vpavgb     %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
@ -4982,123 +4932,12 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    "+c"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", NACL_R14
+  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"
    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"
    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"
    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"
    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"
  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -2259,19 +2259,16 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
 void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
  int y1_fraction = source_y_fraction >> 1;
  asm volatile (
    "cmp        %4, #0                         \n"
    "beq        100f                           \n"
    "add        %2, %1                         \n"
    "cmp        %4, #64                        \n"
    "beq        75f                            \n"
    "cmp        %4, #128                       \n"
    "beq        50f                            \n"
    "cmp        %4, #192                       \n"
    "beq        25f                            \n"
    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
+    "rsb        %4, #128                       \n"
    "vdup.8     d4, %4                         \n"
    // General purpose row blend.
  "1:                                          \n"
@ -2284,27 +2281,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
    "vmull.u8   q14, d1, d4                    \n"
    "vmlal.u8   q13, d2, d5                    \n"
    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
+    "vrshrn.u16 d0, q13, #7                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
+    "vrshrn.u16 d1, q14, #7                    \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        1b                             \n"
    "b          99f                            \n"
    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "vld1.8     {q0}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q1}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        25b                            \n"
    "b          99f                            \n"
    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
@ -2318,20 +2301,6 @@ void InterpolateRow_NEON(uint8* dst_ptr,
    "bgt        50b                            \n"
    "b          99f                            \n"
    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "vld1.8     {q1}, [%1]!                    \n"
    MEMACCESS(2)
    "vld1.8     {q0}, [%2]!                    \n"
    "subs       %3, %3, #16                    \n"
    "vrhadd.u8  q0, q1                         \n"
    "vrhadd.u8  q0, q1                         \n"
    MEMACCESS(0)
    "vst1.8     {q0}, [%0]!                    \n"
    "bgt        75b                            \n"
    "b          99f                            \n"
    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
@ -2346,7 +2315,7 @@ void InterpolateRow_NEON(uint8* dst_ptr,
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
+    "+r"(y1_fraction)       // %4
  :
  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
  );
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2336,18 +2336,14 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
 void InterpolateRow_NEON(uint8* dst_ptr,
                         const uint8* src_ptr, ptrdiff_t src_stride,
                         int dst_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction;
+  int y1_fraction = source_y_fraction >> 1;
-  int y0_fraction = 256 - y1_fraction;
+  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  asm volatile (
    "cmp        %w4, #0                        \n"
    "b.eq       100f                           \n"
    "cmp        %w4, #64                       \n"
    "b.eq       75f                            \n"
    "cmp        %w4, #128                      \n"
    "b.eq       50f                            \n"
    "cmp        %w4, #192                      \n"
    "b.eq       25f                            \n"
    "dup        v5.16b, %w4                    \n"
    "dup        v4.16b, %w5                    \n"
@ -2362,27 +2358,13 @@ void InterpolateRow_NEON(uint8* dst_ptr,
    "umull2     v3.8h, v0.16b, v4.16b          \n"
    "umlal      v2.8h, v1.8b,  v5.8b           \n"
    "umlal2     v3.8h, v1.16b, v5.16b          \n"
-    "rshrn      v0.8b,  v2.8h, #8              \n"
+    "rshrn      v0.8b,  v2.8h, #7              \n"
-    "rshrn2     v0.16b, v3.8h, #8              \n"
+    "rshrn2     v0.16b, v3.8h, #7              \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       1b                             \n"
    "b          99f                            \n"
    // Blend 25 / 75.
  "25:                                         \n"
    MEMACCESS(1)
    "ld1        {v0.16b}, [%1], #16            \n"
    MEMACCESS(2)
    "ld1        {v1.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       25b                            \n"
    "b          99f                            \n"
    // Blend 50 / 50.
  "50:                                         \n"
    MEMACCESS(1)
@ -2396,20 +2378,6 @@ void InterpolateRow_NEON(uint8* dst_ptr,
    "b.gt       50b                            \n"
    "b          99f                            \n"
    // Blend 75 / 25.
  "75:                                         \n"
    MEMACCESS(1)
    "ld1        {v1.16b}, [%1], #16            \n"
    MEMACCESS(2)
    "ld1        {v0.16b}, [%2], #16            \n"
    "subs       %w3, %w3, #16                  \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"
    "urhadd     v0.16b, v0.16b, v1.16b         \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%0], #16            \n"
    "b.gt       75b                            \n"
    "b          99f                            \n"
    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    MEMACCESS(1)
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -5571,12 +5571,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    cmp        eax, 0
    je         xloop100  // 0 / 128.  Blend 100 / 0.
    sub        edi, esi
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
    vmovd      xmm0, eax  // high fraction 0..127
    neg        eax
@ -5587,6 +5583,10 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    vpxor      ymm0, ymm0, ymm0
    vpermd     ymm5, ymm0, ymm5
    mov        eax, 0x00400040  // 64 for rounding.
    vmovd      xmm4, eax
    vbroadcastss ymm4, xmm4
  xloop:
    vmovdqu    ymm0, [esi]
    vmovdqu    ymm2, [esi + edx]
@ -5594,6 +5594,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    vpunpcklbw ymm0, ymm0, ymm2  // mutates
    vpmaddubsw ymm0, ymm0, ymm5
    vpmaddubsw ymm1, ymm1, ymm5
    vpaddw     ymm0, ymm0, ymm4
    vpaddw     ymm1, ymm1, ymm4
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm1, ymm1, 7
    vpackuswb  ymm0, ymm0, ymm1  // unmutates
@ -5603,18 +5605,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop
    jmp        xloop99
   // Blend 25 / 75.
 xloop25:
   vmovdqu    ymm0, [esi]
   vmovdqu    ymm1, [esi + edx]
   vpavgb     ymm0, ymm0, ymm1
   vpavgb     ymm0, ymm0, ymm1
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop25
   jmp        xloop99
   // Blend 50 / 50.
 xloop50:
   vmovdqu    ymm0, [esi]
@ -5625,18 +5615,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
   jg         xloop50
   jmp        xloop99
   // Blend 75 / 25.
 xloop75:
   vmovdqu    ymm1, [esi]
   vmovdqu    ymm0, [esi + edx]
   vpavgb     ymm0, ymm0, ymm1
   vpavgb     ymm0, ymm0, ymm1
   vmovdqu    [esi + edi], ymm0
   lea        esi, [esi + 32]
   sub        ecx, 32
   jg         xloop75
   jmp        xloop99
   // Blend 100 / 0 - Copy row unchanged.
 xloop100:
   rep movsb
@ -5668,12 +5646,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128.  Blend 100 / 0.
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
    movd       xmm0, eax  // high fraction 0..127
    neg        eax
@ -5683,6 +5657,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x00400040  // 64 for rounding.
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0x00
  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
@ -5691,6 +5669,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm1, xmm5
    paddw      xmm0, xmm4
    paddw      xmm1, xmm4
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
@ -5700,18 +5680,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop
    jmp        xloop99
    // Blend 25 / 75.
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop25
    jmp        xloop99
    // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
@ -5723,18 +5691,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop50
    jmp        xloop99
    // Blend 75 / 25.
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop75
    jmp        xloop99
    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
@ -5750,114 +5706,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  }
 }
 #ifdef HAS_INTERPOLATEROW_SSE2
 // Bilinear filter 16x2 -> 16x1
 __declspec(naked)
 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4
  xloop:
    movdqu     xmm0, [esi]  // row0
    movdqu     xmm2, [esi + edx]  // row1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop
    jmp        xloop99
    // Blend 25 / 75.
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop25
    jmp        xloop99
    // Blend 50 / 50.
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop50
    jmp        xloop99
    // Blend 75 / 25.
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop75
    jmp        xloop99
    // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    movdqu     xmm0, [esi]
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    sub        ecx, 16
    jg         xloop100
  xloop99:
    pop        edi
    pop        esi
    ret
  }
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 __declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
--- a/source/scale.cc
+++ b/source/scale.cc
@ -875,14 +875,6 @@ void ScalePlaneBilinearDown(int src_width, int src_height,
             &x, &y, &dx, &dy);
  src_width = Abs(src_width);
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(src_width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@ -1072,14 +1064,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height,
             &x, &y, &dx, &dy);
  src_width = Abs(src_width);
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -210,14 +210,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height,
  clip_src_width = (int)(xr - xl) * 4;  // Width aligned to 4.
  src_argb += xl * 4;
  x -= (int)(xl << 16);
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(clip_src_width, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@ -308,14 +300,6 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
      int dst_width, int x, int dx) =
      filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
  const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(dst_width, 4)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
@ -494,14 +478,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
      InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(dst_width, 4)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@ -876,14 +876,6 @@ void ScalePlaneVertical(int src_height,
  assert(dst_width > 0);
  assert(dst_height > 0);
  src_argb += (x >> 16) * bpp;
 #if defined(HAS_INTERPOLATEROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_SSE2;
    if (IS_ALIGNED(dst_width_bytes, 16)) {
      InterpolateRow = InterpolateRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -909,15 +909,15 @@ TEST_F(LibYUVPlanarTest, TestARGBInterpolate) {
  EXPECT_EQ(0u, interpolate_pixels[1][0]);
  EXPECT_EQ(0u, interpolate_pixels[1][1]);
  EXPECT_EQ(0u, interpolate_pixels[1][2]);
-  EXPECT_NEAR(128u, interpolate_pixels[1][3], 1);  // C = 127, SSE = 128.
+  EXPECT_EQ(128u, interpolate_pixels[1][3]);
  EXPECT_EQ(0u, interpolate_pixels[2][0]);
  EXPECT_EQ(0u, interpolate_pixels[2][1]);
  EXPECT_EQ(0u, interpolate_pixels[2][2]);
  EXPECT_EQ(0u, interpolate_pixels[2][3]);
-  EXPECT_NEAR(128u, interpolate_pixels[3][0], 1);
+  EXPECT_EQ(128u, interpolate_pixels[3][0]);
-  EXPECT_NEAR(128u, interpolate_pixels[3][1], 1);
+  EXPECT_EQ(128u, interpolate_pixels[3][1]);
-  EXPECT_NEAR(128u, interpolate_pixels[3][2], 1);
+  EXPECT_EQ(128u, interpolate_pixels[3][2]);
-  EXPECT_NEAR(128u, interpolate_pixels[3][3], 1);
+  EXPECT_EQ(128u, interpolate_pixels[3][3]);
  ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0,
                  &interpolate_pixels[0][0], 0, 4, 1, 0);
@ -991,15 +991,15 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
  EXPECT_EQ(0u, interpolate_pixels[4]);
  EXPECT_EQ(0u, interpolate_pixels[5]);
  EXPECT_EQ(0u, interpolate_pixels[6]);
-  EXPECT_NEAR(128u, interpolate_pixels[7], 1);  // C = 127, SSE = 128.
+  EXPECT_EQ(128u, interpolate_pixels[7]);
  EXPECT_EQ(0u, interpolate_pixels[8]);
  EXPECT_EQ(0u, interpolate_pixels[9]);
  EXPECT_EQ(0u, interpolate_pixels[10]);
  EXPECT_EQ(0u, interpolate_pixels[11]);
-  EXPECT_NEAR(128u, interpolate_pixels[12], 1);
+  EXPECT_EQ(128u, interpolate_pixels[12]);
-  EXPECT_NEAR(128u, interpolate_pixels[13], 1);
+  EXPECT_EQ(128u, interpolate_pixels[13]);
-  EXPECT_NEAR(128u, interpolate_pixels[14], 1);
+  EXPECT_EQ(128u, interpolate_pixels[14]);
-  EXPECT_NEAR(128u, interpolate_pixels[15], 1);
+  EXPECT_EQ(128u, interpolate_pixels[15]);
  InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
                   &interpolate_pixels[0], 0, 16, 1, 0);
@ -1013,12 +1013,12 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) {
  EXPECT_EQ(4u, interpolate_pixels[0]);
  EXPECT_EQ(8u, interpolate_pixels[1]);
-  EXPECT_EQ(16u,interpolate_pixels[2]);
+  EXPECT_EQ(16u, interpolate_pixels[2]);
  EXPECT_EQ(32u, interpolate_pixels[3]);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
    InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0,
-                     &interpolate_pixels[0], 0, 1280, 1, 128);
+                     &interpolate_pixels[0], 0, 1280, 1, 123);
  }
 }