diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4fa8343c3..f7620eea1 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -175,7 +175,6 @@ extern "C" { #define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#define HAS_INTERPOLATEROW_SSE2 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_RGBCOLORTABLEROW_X86 #define HAS_SOBELROW_SSE2 @@ -1838,9 +1837,6 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); @@ -1856,9 +1852,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride_ptr, int width, int source_y_fraction); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 062da932f..536e1d528 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1870,14 +1870,6 @@ int InterpolatePlane(const uint8* src0, int src_stride0, height = 1; src_stride0 = src_stride1 = dst_stride = 0; } -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2467,14 +2459,6 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2571,14 +2555,6 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; diff --git a/source/row_any.cc b/source/row_any.cc index fef7ecd9a..5e5f435a6 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -593,9 +593,6 @@ ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #ifdef HAS_INTERPOLATEROW_SSSE3 ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_SSE2 -ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15) -#endif #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif diff --git a/source/row_common.cc b/source/row_common.cc index 058e21d91..60d5b8c53 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2211,27 +2211,30 @@ static void HalfRow_16_C(const uint16* src_uv, 
int src_uv_stride, void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; + int y1_fraction = source_y_fraction >> 1; + int y0_fraction = 128 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; int x; - if (source_y_fraction == 0) { + if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); return; } - if (source_y_fraction == 128) { + if (y1_fraction == 64) { HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width); return; } for (x = 0; x < width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7; + dst_ptr[1] = + (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction + 64) >> 7; src_ptr += 2; src_ptr1 += 2; dst_ptr += 2; } if (width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_ptr[0] = + (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction + 64) >> 7; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0ccf76b30..d1b25140f 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4794,12 +4794,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "shr %3 \n" "cmp $0x0,%3 \n" "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" "cmp $0x40,%3 \n" "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" "movd %3,%%xmm0 \n" "neg %3 \n" @@ -4808,6 +4804,9 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "punpcklbw %%xmm0,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x400040,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN @@ -4819,6 +4818,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "punpckhbw %%xmm2,%%xmm1 \n" "pmaddubsw %%xmm5,%%xmm0 \n" "pmaddubsw %%xmm5,%%xmm1 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" @@ -4828,19 +4829,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "jg 1b \n" "jmp 99f \n" - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 25b \n" - "jmp 99f \n" - // Blend 50 / 50. LABELALIGN "50: \n" @@ -4853,19 +4841,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "jg 50b \n" "jmp 99f \n" - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 75b \n" - "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" @@ -4881,8 +4856,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm5" + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" ); } #endif // HAS_INTERPOLATEROW_SSSE3 @@ -4897,12 +4872,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "cmp $0x0,%3 \n" "je 100f \n" "sub %1,%0 \n" - "cmp $0x20,%3 \n" - "je 75f \n" "cmp $0x40,%3 \n" "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" "vmovd %3,%%xmm0 \n" "neg %3 \n" @@ -4912,6 +4883,9 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpermd %%ymm5,%%ymm0,%%ymm5 \n" + "mov $0x400040,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN @@ -4922,6 +4896,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm5,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm5,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" "vpsrlw $0x7,%%ymm0,%%ymm0 \n" "vpsrlw $0x7,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" @@ -4931,19 +4907,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "jg 1b \n" "jmp 99f \n" - // Blend 25 / 75. - LABELALIGN - "25: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm1) - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 25b \n" - "jmp 99f \n" - // Blend 50 / 50. LABELALIGN "50: \n" @@ -4955,19 +4918,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "jg 50b \n" "jmp 99f \n" - // Blend 75 / 25. - LABELALIGN - "75: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm0) - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 75b \n" - "jmp 99f \n" - // Blend 100 / 0 - Copy row unchanged. LABELALIGN "100: \n" @@ -4982,123 +4932,12 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, "+c"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm5" + : "memory", "cc", "eax", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" ); } #endif // HAS_INTERPOLATEROW_AVX2 -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - LABELALIGN - "25: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - LABELALIGN - "75: \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_INTERPOLATEROW_SSE2 - #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, diff --git a/source/row_neon.cc b/source/row_neon.cc index f4f2e8559..2522d501a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2259,19 +2259,16 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + int y1_fraction = source_y_fraction >> 1; asm volatile ( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" + "rsb %4, #128 \n" "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" @@ -2284,27 +2281,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" + "vrshrn.u16 d0, q13, #7 \n" + "vrshrn.u16 d1, q14, #7 \n" MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! 
\n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - // Blend 50 / 50. "50: \n" MEMACCESS(1) @@ -2318,20 +2301,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "bgt 50b \n" "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) @@ -2346,7 +2315,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 + "+r"(y1_fraction) // %4 : : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" ); diff --git a/source/row_neon64.cc b/source/row_neon64.cc index a9801f2e4..f62a34bfb 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2336,18 +2336,14 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; + int y1_fraction = source_y_fraction >> 1; + int y0_fraction = 128 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" @@ -2362,27 +2358,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, "umull2 v3.8h, v0.16b, v4.16b \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" + "rshrn v0.8b, v2.8h, #7 \n" + "rshrn2 v0.16b, v3.8h, #7 \n" MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" "b.gt 1b \n" "b 99f \n" - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - // Blend 50 / 50. "50: \n" MEMACCESS(1) @@ -2396,20 +2378,6 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b.gt 50b \n" "b 99f \n" - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) diff --git a/source/row_win.cc b/source/row_win.cc index 084fc0444..494043c62 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5571,12 +5571,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, cmp eax, 0 je xloop100 // 0 / 128. Blend 100 / 0. sub edi, esi - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. cmp eax, 64 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. vmovd xmm0, eax // high fraction 0..127 neg eax @@ -5587,6 +5583,10 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpxor ymm0, ymm0, ymm0 vpermd ymm5, ymm0, ymm5 + mov eax, 0x00400040 // 64 for rounding. 
+ vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + xloop: vmovdqu ymm0, [esi] vmovdqu ymm2, [esi + edx] @@ -5594,6 +5594,8 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpunpcklbw ymm0, ymm0, ymm2 // mutates vpmaddubsw ymm0, ymm0, ymm5 vpmaddubsw ymm1, ymm1, ymm5 + vpaddw ymm0, ymm0, ymm4 + vpaddw ymm1, ymm1, ymm4 vpsrlw ymm0, ymm0, 7 vpsrlw ymm1, ymm1, 7 vpackuswb ymm0, ymm0, ymm1 // unmutates @@ -5603,18 +5605,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 25 / 75. - xloop25: - vmovdqu ymm0, [esi] - vmovdqu ymm1, [esi + edx] - vpavgb ymm0, ymm0, ymm1 - vpavgb ymm0, ymm0, ymm1 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop25 - jmp xloop99 - // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] @@ -5625,18 +5615,6 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 75 / 25. - xloop75: - vmovdqu ymm1, [esi] - vmovdqu ymm0, [esi + edx] - vpavgb ymm0, ymm0, ymm1 - vpavgb ymm0, ymm0, ymm1 - vmovdqu [esi + edi], ymm0 - lea esi, [esi + 32] - sub ecx, 32 - jg xloop75 - jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5668,12 +5646,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. cmp eax, 64 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. movd xmm0, eax // high fraction 0..127 neg eax @@ -5683,6 +5657,10 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 + mov eax, 0x00400040 // 64 for rounding. + movd xmm4, eax + pshufd xmm4, xmm4, 0x00 + xloop: movdqu xmm0, [esi] movdqu xmm2, [esi + edx] @@ -5691,6 +5669,8 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, punpckhbw xmm1, xmm2 pmaddubsw xmm0, xmm5 pmaddubsw xmm1, xmm5 + paddw xmm0, xmm4 + paddw xmm1, xmm4 psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 @@ -5700,18 +5680,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 25 / 75. - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop25 - jmp xloop99 - // Blend 50 / 50. xloop50: movdqu xmm0, [esi] @@ -5723,18 +5691,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 75 / 25. - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop75 - jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] @@ -5750,114 +5706,6 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } } -#ifdef HAS_INTERPOLATEROW_SSE2 -// Bilinear filter 16x2 -> 16x1 -__declspec(naked) -void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. 
- cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. - cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. - - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - xloop: - movdqu xmm0, [esi] // row0 - movdqu xmm2, [esi + edx] // row1 - movdqu xmm1, xmm0 - movdqu xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop - jmp xloop99 - - // Blend 25 / 75. - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - xloop100: - movdqu xmm0, [esi] - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - sub ecx, 16 - jg xloop100 - - xloop99: - pop edi - pop esi - ret - } -} -#endif // HAS_INTERPOLATEROW_SSE2 - // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, diff --git a/source/scale.cc b/source/scale.cc index 0dfd99b20..595314f35 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -875,14 +875,6 @@ void ScalePlaneBilinearDown(int src_width, int src_height, &x, &y, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(src_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -1072,14 +1064,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height, &x, &y, &dx, &dy); src_width = Abs(src_width); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 132581cce..adddf9db5 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -210,14 +210,6 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. 
src_argb += xl * 4; x -= (int)(xl << 16); -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(clip_src_width, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -308,14 +300,6 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -494,14 +478,6 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; diff --git a/source/scale_common.cc b/source/scale_common.cc index 1711f3d54..f5c908d0b 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -876,14 +876,6 @@ void ScalePlaneVertical(int src_height, assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * bpp; -#if defined(HAS_INTERPOLATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - InterpolateRow = InterpolateRow_Any_SSE2; - if (IS_ALIGNED(dst_width_bytes, 16)) { - InterpolateRow = InterpolateRow_SSE2; - } - } -#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index f31e45f79..c18e01417 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -909,15 +909,15 @@ TEST_F(LibYUVPlanarTest, TestARGBInterpolate) { EXPECT_EQ(0u, interpolate_pixels[1][0]); EXPECT_EQ(0u, interpolate_pixels[1][1]); EXPECT_EQ(0u, interpolate_pixels[1][2]); - EXPECT_NEAR(128u, interpolate_pixels[1][3], 1); // C = 127, SSE = 128. + EXPECT_EQ(128u, interpolate_pixels[1][3]); EXPECT_EQ(0u, interpolate_pixels[2][0]); EXPECT_EQ(0u, interpolate_pixels[2][1]); EXPECT_EQ(0u, interpolate_pixels[2][2]); EXPECT_EQ(0u, interpolate_pixels[2][3]); - EXPECT_NEAR(128u, interpolate_pixels[3][0], 1); - EXPECT_NEAR(128u, interpolate_pixels[3][1], 1); - EXPECT_NEAR(128u, interpolate_pixels[3][2], 1); - EXPECT_NEAR(128u, interpolate_pixels[3][3], 1); + EXPECT_EQ(128u, interpolate_pixels[3][0]); + EXPECT_EQ(128u, interpolate_pixels[3][1]); + EXPECT_EQ(128u, interpolate_pixels[3][2]); + EXPECT_EQ(128u, interpolate_pixels[3][3]); ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 4, 1, 0); @@ -991,15 +991,15 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { EXPECT_EQ(0u, interpolate_pixels[4]); EXPECT_EQ(0u, interpolate_pixels[5]); EXPECT_EQ(0u, interpolate_pixels[6]); - EXPECT_NEAR(128u, interpolate_pixels[7], 1); // C = 127, SSE = 128. 
+ EXPECT_EQ(128u, interpolate_pixels[7]); EXPECT_EQ(0u, interpolate_pixels[8]); EXPECT_EQ(0u, interpolate_pixels[9]); EXPECT_EQ(0u, interpolate_pixels[10]); EXPECT_EQ(0u, interpolate_pixels[11]); - EXPECT_NEAR(128u, interpolate_pixels[12], 1); - EXPECT_NEAR(128u, interpolate_pixels[13], 1); - EXPECT_NEAR(128u, interpolate_pixels[14], 1); - EXPECT_NEAR(128u, interpolate_pixels[15], 1); + EXPECT_EQ(128u, interpolate_pixels[12]); + EXPECT_EQ(128u, interpolate_pixels[13]); + EXPECT_EQ(128u, interpolate_pixels[14]); + EXPECT_EQ(128u, interpolate_pixels[15]); InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, &interpolate_pixels[0], 0, 16, 1, 0); @@ -1013,12 +1013,12 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { EXPECT_EQ(4u, interpolate_pixels[0]); EXPECT_EQ(8u, interpolate_pixels[1]); - EXPECT_EQ(16u,interpolate_pixels[2]); + EXPECT_EQ(16u, interpolate_pixels[2]); EXPECT_EQ(32u, interpolate_pixels[3]); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, - &interpolate_pixels[0], 0, 1280, 1, 128); + &interpolate_pixels[0], 0, 1280, 1, 123); } }
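
A minimal standalone sketch (not part of the patch) of the rounded 7-bit blend that the new InterpolateRow_C in row_common.cc implements: the 0..255 source_y_fraction is halved to 0..128, and each output byte becomes (src0 * (128 - f) + src1 * f + 64) >> 7, so a 50/50 blend of 0 and 255 now rounds to 128 (matching the updated EXPECT_EQ checks) instead of truncating to 127. The helper name InterpolateRowRef is hypothetical, and the HalfRow special case from the real function is omitted for brevity.

#include <stdio.h>
#include <string.h>

typedef unsigned char uint8;

// Reference blend of two rows using the new 7-bit fraction with rounding.
static void InterpolateRowRef(uint8* dst, const uint8* src0, const uint8* src1,
                              int width, int source_y_fraction) {
  int y1 = source_y_fraction >> 1;  // 0..128
  int y0 = 128 - y1;
  if (y1 == 0) {                    // Fraction 0 or 1: copy row 0 unchanged.
    memcpy(dst, src0, width);
    return;
  }
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8)((src0[x] * y0 + src1[x] * y1 + 64) >> 7);
  }
}

int main(void) {
  uint8 row0[4] = {0, 0, 0, 0};
  uint8 row1[4] = {255, 255, 255, 255};
  uint8 out[4];
  InterpolateRowRef(out, row0, row1, 4, 128);  // 50/50 blend.
  printf("%d\n", out[0]);                      // Prints 128, not 127.
  return 0;
}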