From b911428afd3994f47e5780a80c876d05d1d4c590 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 30 May 2013 23:42:27 +0000 Subject: [PATCH] Adapt row interpolator to do YUV as well as ARGB without extrude so it can be used in I420Scale. BUG=237 TEST=Scale* R=ryanpetrie@google.com Review URL: https://webrtc-codereview.appspot.com/1587004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@710 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 65 +-- include/libyuv/version.h | 2 +- source/planar_functions.cc | 43 +- source/row_any.cc | 38 +- source/row_common.cc | 21 +- source/row_mips.cc | 57 +++ source/row_neon.cc | 19 +- source/row_posix.cc | 94 ++-- source/row_win.cc | 92 ++-- source/scale.cc | 940 +++-------------------------------- source/scale_argb.cc | 111 +++-- source/scale_mips.cc | 58 --- unit_test/scale_argb_test.cc | 78 ++- unit_test/scale_test.cc | 50 +- 15 files changed, 475 insertions(+), 1195 deletions(-) diff --git a/README.chromium b/README.chromium index 4f9044897..e4a82a09d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 709 +Version: 710 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2c17a3585..eccad6582 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -109,8 +109,8 @@ extern "C" { #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBGRAYROW_SSSE3 -#define HAS_ARGBINTERPOLATEROW_SSE2 -#define HAS_ARGBINTERPOLATEROW_SSSE3 +#define HAS_INTERPOLATEROW_SSE2 +#define HAS_INTERPOLATEROW_SSSE3 #define HAS_ARGBMIRRORROW_SSSE3 #define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBQUANTIZEROW_SSE2 @@ -261,7 +261,6 @@ extern "C" { #define HAS_ARGBBLENDROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON -#define HAS_ARGBINTERPOLATEROW_NEON #define HAS_ARGBMIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON @@ -272,6 +271,7 @@ extern "C" { #define HAS_SOBELXYROW_NEON #define HAS_SOBELXROW_NEON #define HAS_SOBELYROW_NEON +#define HAS_INTERPOLATEROW_NEON #endif // The following are available on Mips platforms @@ -281,6 +281,7 @@ extern "C" { #define HAS_I422TOABGRROW_MIPS_DSPR2 #define HAS_I422TOARGBROW_MIPS_DSPR2 #define HAS_I422TOBGRAROW_MIPS_DSPR2 +#define HAS_INTERPOLATEROWS_MIPS_DSPR2 #define HAS_MIRRORROW_MIPS_DSPR2 #define HAS_MIRRORUVROW_MIPS_DSPR2 #define HAS_SPLITUVROW_MIPS_DSPR2 @@ -1455,34 +1456,40 @@ LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); -// Used for ARGBScale and ARGBInterpolate. -void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, - int width, int source_y_fraction); -void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, +// Used for I420Scale, ARGBScale, and ARGBInterpolate. 
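Note that the width argument of the renamed InterpolateRow functions counts bytes, not pixels, which is what lets one row interpolator serve planar YUV (1 byte per pixel) and ARGB (4 bytes per pixel) alike. A sketch of the two kinds of call sites (the variable names here are illustrative, not part of the patch):

  // Y plane: 1 byte per pixel, so the byte count equals the pixel count.
  InterpolateRow_C(dst_y, src_y, src_stride_y, width, source_y_fraction);
  // ARGB: 4 bytes per pixel, so pass width * 4, matching the
  // ARGBInterpolate loop later in this patch.
  InterpolateRow_C(dst_argb, src_argb, src_stride_argb, width * 4,
                   source_y_fraction);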
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, +void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); +void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, int source_y_fraction); -void ARGBInterpolateRow_NEON(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_NEON(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); -void ARGBInterpolateRow_Any_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride_argb, int width, - int source_y_fraction); +void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride_ptr, int width, + int source_y_fraction); // Sobel images. void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 779b8f00d..4a1170669 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 709 +#define LIBYUV_VERSION 710 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 2133f4e2b..09e2f5368 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1617,7 +1617,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 ARGB images by specified amount (0 to 255). // TODO(fbarchard): Consider selecting a specialization for interpolation so -// row function doesn't need to check interpolation on each row. +// row function doesn't need to check interpolation on each row. 
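The TODO refers to the dispatch each SIMD kernel performs on every call: source_y_fraction is compared against 0, 64, 128 and 192 before the general blend loop runs. A sketch of the intended specialization, hoisting that check out of the per-row loop (CopyRow100 and BlendRow50 are hypothetical names, not functions in this patch):

  // Select a specialized kernel once per image, so rows skip the dispatch.
  void (*Row)(uint8* dst, const uint8* src, ptrdiff_t stride,
              int width, int fraction) = InterpolateRow;
  if (interpolation == 0) {
    Row = CopyRow100;   // 100/0 blend degenerates to a straight row copy.
  } else if (interpolation == 128) {
    Row = BlendRow50;   // 50/50 blend is a single rounding-average pass.
  }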
LIBYUV_API int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, const uint8* src_argb1, int src_stride_argb1, @@ -1642,46 +1642,55 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, width * height, 1, interpolation); } - void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) = ARGBInterpolateRow_C; -#if defined(HAS_ARGBINTERPOLATEROW_SSE2) + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_SSE2; + InterpolateRow = InterpolateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_Unaligned_SSE2; + InterpolateRow = InterpolateRow_Unaligned_SSE2; if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBInterpolateRow = ARGBInterpolateRow_SSE2; + InterpolateRow = InterpolateRow_SSE2; } } } #endif -#if defined(HAS_ARGBINTERPOLATEROW_SSSE3) +#if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_SSSE3; + InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_Unaligned_SSSE3; + InterpolateRow = InterpolateRow_Unaligned_SSSE3; if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) && IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGBInterpolateRow = ARGBInterpolateRow_SSSE3; + InterpolateRow = InterpolateRow_SSSE3; } } } #endif -#if defined(HAS_ARGBINTERPOLATEROW_NEON) +#if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 4) { - ARGBInterpolateRow = ARGBInterpolateRow_Any_NEON; + InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 4)) { - ARGBInterpolateRow = ARGBInterpolateRow_NEON; + InterpolateRow = InterpolateRow_NEON; } } #endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 && + IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) && + IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + InterpolateRow = InterpolateRows_MIPS_DSPR2; + } +#endif + for (int y = 0; y < height; ++y) { - ARGBInterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, - width, interpolation); + InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0, + width * 4, interpolation); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; diff --git a/source/row_any.cc b/source/row_any.cc index 83afb420e..72100d90e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -483,29 +483,33 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, #undef YANY // Interpolate may want to work in place, so last16 method can not be used.
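For reference, the NANY macro that follows expands, for the SSE2 case (SBPP = BPP = 1, MASK = 15), to roughly the following, written out here to show why the in-place constraint matters: the C tail begins exactly where the SIMD portion stopped, so when dst_ptr aliases src_ptr no byte is read after it has been overwritten, which re-running SIMD over the last 16 bytes would violate:

  void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride_ptr, int width,
                               int source_y_fraction) {
    int n = width & ~15;  // largest multiple of 16 bytes
    InterpolateRow_Unaligned_SSE2(dst_ptr, src_ptr, src_stride_ptr,
                                  n, source_y_fraction);
    InterpolateRow_C(dst_ptr + n, src_ptr + n, src_stride_ptr,
                     width & 15, source_y_fraction);  // 0..15 leftover bytes
  }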
-#define NANY(NAMEANY, ARGBTERP_SIMD, ARGBTERP_C, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_argb, const uint8* src_argb, \ - ptrdiff_t src_stride_argb, int width, \ +#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ int source_y_fraction) { \ int n = width & ~MASK; \ - ARGBTERP_SIMD(dst_argb, src_argb, src_stride_argb, \ - n, source_y_fraction); \ - ARGBTERP_C(dst_argb + n * BPP, \ - src_argb + n * SBPP, src_stride_argb, \ - width & MASK, source_y_fraction); \ + TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, \ + n, source_y_fraction); \ + TERP_C(dst_ptr + n * BPP, \ + src_ptr + n * SBPP, src_stride_ptr, \ + width & MASK, source_y_fraction); \ } -#ifdef HAS_ARGBINTERPOLATEROW_SSSE3 -NANY(ARGBInterpolateRow_Any_SSSE3, ARGBInterpolateRow_Unaligned_SSSE3, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_SSSE3 +NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3, + InterpolateRow_C, 1, 1, 15) #endif -#ifdef HAS_ARGBINTERPOLATEROW_SSE2 -NANY(ARGBInterpolateRow_Any_SSE2, ARGBInterpolateRow_Unaligned_SSE2, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_SSE2 +NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2, + InterpolateRow_C, 1, 1, 15) #endif -#ifdef HAS_ARGBINTERPOLATEROW_NEON -NANY(ARGBInterpolateRow_Any_NEON, ARGBInterpolateRow_NEON, - ARGBInterpolateRow_C, 4, 4, 3) +#ifdef HAS_INTERPOLATEROW_NEON +NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, + InterpolateRow_C, 1, 1, 15) +#endif +#ifdef HAS_INTERPOLATEROWS_MIPS_DSPR2 +NANY(InterpolateRows_Any_MIPS_DSPR2, InterpolateRows_MIPS_DSPR2, + InterpolateRow_C, 1, 1, 3) #endif #undef NANY diff --git a/source/row_common.cc b/source/row_common.cc index 43f79a329..badea4405 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1775,9 +1775,9 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // C version 2x2 -> 2x1.
-void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, - int width, int source_y_fraction) { +void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, + int width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; @@ -1785,21 +1785,12 @@ void ARGBInterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, for (int x = 0; x < width - 1; x += 2) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; - dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; - dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; - dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; - dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; - dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; - src_ptr += 8; - src_ptr1 += 8; - dst_ptr += 8; + src_ptr += 2; + src_ptr1 += 2; + dst_ptr += 2; } if (width & 1) { dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; - dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; } } diff --git a/source/row_mips.cc b/source/row_mips.cc index c4f1e773b..69677aa2d 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -909,6 +909,63 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "s4", "s5", "s6" ); } + +// Bilinear filter 8x2 -> 8x1 +void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { + int y0_fraction = 256 - source_y_fraction; + const uint8* src_ptr1 = src_ptr + src_stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "replv.ph $t0, %[y0_fraction] \n" + "replv.ph $t1, %[source_y_fraction] \n" + "1: \n" + "lw $t2, 0(%[src_ptr]) \n" + "lw $t3, 0(%[src_ptr1]) \n" + "lw $t4, 4(%[src_ptr]) \n" + "lw $t5, 4(%[src_ptr1]) \n" + "muleu_s.ph.qbl $t6, $t2, $t0 \n" + "muleu_s.ph.qbr $t7, $t2, $t0 \n" + "muleu_s.ph.qbl $t8, $t3, $t1 \n" + "muleu_s.ph.qbr $t9, $t3, $t1 \n" + "muleu_s.ph.qbl $t2, $t4, $t0 \n" + "muleu_s.ph.qbr $t3, $t4, $t0 \n" + "muleu_s.ph.qbl $t4, $t5, $t1 \n" + "muleu_s.ph.qbr $t5, $t5, $t1 \n" + "addq.ph $t6, $t6, $t8 \n" + "addq.ph $t7, $t7, $t9 \n" + "addq.ph $t2, $t2, $t4 \n" + "addq.ph $t3, $t3, $t5 \n" + "shra.ph $t6, $t6, 8 \n" + "shra.ph $t7, $t7, 8 \n" + "shra.ph $t2, $t2, 8 \n" + "shra.ph $t3, $t3, 8 \n" + "precr.qb.ph $t6, $t6, $t7 \n" + "precr.qb.ph $t2, $t2, $t3 \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[src_ptr1], %[src_ptr1], 8 \n" + "addiu %[dst_width], %[dst_width], -8 \n" + "sw $t6, 0(%[dst_ptr]) \n" + "sw $t2, 4(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[dst_ptr], %[dst_ptr], 8 \n" + + ".set pop \n" + : [dst_ptr] "+r" (dst_ptr), + [src_ptr1] "+r" (src_ptr1), + [src_ptr] "+r" (src_ptr), + [dst_width] "+r" (dst_width) + : [source_y_fraction] "r" (source_y_fraction), + [y0_fraction] "r" (y0_fraction), + [src_stride] "r" (src_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} #endif // __mips_dsp_rev >= 2 #ifdef __cplusplus diff --git a/source/row_neon.cc b/source/row_neon.cc 
index 82587a334..53da16afa 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2161,11 +2161,10 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { ); } -// 4x2 -> 4x1 -// Same as ScaleARGBFilterRows_NEON but with last pixel not duplicated. -void ARGBInterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_NEON(uint8* dst_ptr, + const uint8* src_ptr, ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { asm volatile ( "cmp %4, #0 \n" "beq 100f \n" @@ -2184,7 +2183,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "1: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" @@ -2199,7 +2198,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "25: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -2210,7 +2209,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "50: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 50b \n" @@ -2220,7 +2219,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, "75: \n" "vld1.u8 {q1}, [%1]! \n" "vld1.u8 {q0}, [%2]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -2230,7 +2229,7 @@ void ARGBInterpolateRow_NEON(uint8* dst_ptr, // Blend 100 / 0 - Copy row unchanged. "100: \n" "vld1.u8 {q0}, [%1]! \n" - "subs %3, #4 \n" + "subs %3, %3, #16 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 100b \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index 6b6c88605..760b9a984 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4781,7 +4781,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 + "+r"(src_dudv), // %3 "+rm"(width), // %4 "+r"(temp) // %5 : @@ -4793,11 +4793,10 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
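The 25/50/75 fast paths in the NEON kernel above (and the x86 kernels below) are built from rounding-average instructions, vrhadd.u8 on NEON and pavgb on SSE2/SSSE3, where avg(a, b) = (a + b + 1) >> 1. One average is the 50/50 blend; a second average against the same row folds in another copy of it. A scalar check of the 75/25 branch, as a sketch (fraction 64/256 keeps 75% of row 0):

  static uint8 AvgRound(uint8 a, uint8 b) {
    return (uint8)((a + b + 1) >> 1);
  }
  // xloop75 loads row1 into the destination register, then averages
  // against row0 twice: avg(avg(row1, row0), row0), which is roughly
  // (3 * row0 + row1 + 2) / 4, a 75/25 blend with rounding.
  uint8 blend75 = AvgRound(AvgRound(row1_px, row0_px), row0_px);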
-void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -4831,7 +4830,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -4844,7 +4843,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -4856,7 +4855,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -4869,7 +4868,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -4879,14 +4878,14 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqa (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -4897,11 +4896,10 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
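In the SSSE3 general-purpose loop above, the fraction is halved up front (shr %3), xmm5 is filled with interleaved byte pairs (128 - f/2, f/2), and pmaddubsw multiplies interleaved row0/row1 bytes by that pair and sums each pair in one instruction; psrlw by 7 then divides by 128. The per-byte arithmetic, as a scalar sketch:

  int hi = source_y_fraction >> 1;  // weight of row 1, 0..127
  int lo = 128 - hi;                // weight of row 0; lo + hi == 128
  dst[i] = (uint8)((row0[i] * lo + row1[i] * hi) >> 7);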
-void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -4943,7 +4941,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -4956,7 +4954,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -4968,7 +4966,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1),%%xmm0 \n" "movdqa (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -4981,7 +4979,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -4991,14 +4989,14 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqa (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -5009,11 +5007,10 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
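The SSE2 loop above cannot use pmaddubsw, so it keeps the full 0..255 fraction and rewrites the blend to need one multiply per lane: f * row1 + (256 - f) * row0 == 256 * row0 + f * (row1 - row0). The 9-bit signed difference is doubled (paddw) and multiplied with pmulhw against a fraction scaled to approximately f * 128, so the >> 16 built into pmulhw leaves (diff * f) >> 8, up to rounding. The same arithmetic in scalar form, as a sketch:

  int diff = row1[i] - row0[i];        // -255..255 fits 9 bits signed
  int f = source_y_fraction;           // 0..255
  dst[i] = (uint8)(row0[i] + ((diff * f) >> 8));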
-void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -5047,7 +5044,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -5060,7 +5057,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -5072,7 +5069,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1),%%xmm0 \n" "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -5085,7 +5082,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -5095,14 +5092,14 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqu (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 @@ -5113,11 +5110,10 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, ); } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
-void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +// Bilinear filter 16x2 -> 16x1 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { asm volatile ( "sub %1,%0 \n" "shr %3 \n" @@ -5159,7 +5155,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" @@ -5172,7 +5168,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 25b \n" @@ -5184,7 +5180,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1),%%xmm0 \n" "movdqu (%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 50b \n" @@ -5197,7 +5193,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqu (%1,%4,1),%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" "pavgb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 75b \n" @@ -5207,14 +5203,14 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, ".p2align 4 \n" "100: \n" "movdqu (%1),%%xmm0 \n" - "sub $0x4,%2 \n" + "sub $0x10,%2 \n" "movdqu %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 100b \n" "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 diff --git a/source/row_win.cc b/source/row_win.cc index 59a58d726..0ecd6cf49 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5923,17 +5923,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, } #endif // HAS_ARGBAFFINEROW_SSE2 -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5969,7 +5968,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -5982,7 +5981,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -5994,7 +5993,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi] movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6007,7 +6006,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6017,7 +6016,7 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqa xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6029,17 +6028,16 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSE2 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6081,7 +6079,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, paddw xmm0, xmm2 // sum rows paddw xmm1, xmm3 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6094,7 +6092,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6106,7 +6104,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi] movdqa xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6119,7 +6117,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6129,7 +6127,7 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqa xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6141,17 +6139,16 @@ void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSSE3 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6187,7 +6184,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, psrlw xmm0, 7 psrlw xmm1, 7 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6200,7 +6197,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6212,7 +6209,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6225,7 +6222,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6235,7 +6232,7 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqu xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 @@ -6247,17 +6244,16 @@ void ARGBInterpolateRow_Unaligned_SSSE3(uint8* dst_argb, const uint8* src_argb, } } -// Bilinear image filtering. -// Same as ScaleARGBFilterRows_SSE2 but without last pixel duplicated. 
+// Bilinear filter 16x2 -> 16x1 __declspec(naked) __declspec(align(16)) -void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -6299,7 +6295,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, paddw xmm0, xmm2 // sum rows paddw xmm1, xmm3 packuswb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop @@ -6312,7 +6308,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop25 @@ -6324,7 +6320,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop50 @@ -6337,7 +6333,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, movdqu xmm0, [esi + edx] pavgb xmm0, xmm1 pavgb xmm0, xmm1 - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop75 @@ -6347,7 +6343,7 @@ void ARGBInterpolateRow_Unaligned_SSE2(uint8* dst_argb, const uint8* src_argb, align 16 xloop100: movdqu xmm0, [esi] - sub ecx, 4 + sub ecx, 16 movdqu [esi + edi], xmm0 lea esi, [esi + 16] jg xloop100 diff --git a/source/scale.cc b/source/scale.cc index 4189d3dcd..7641b07df 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -91,11 +91,6 @@ void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -// 16x2 -> 16x1 -#define HAS_SCALEFILTERROWS_NEON -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction); // SSE2 downscalers with interpolation. // Constants for SSSE3 code @@ -809,350 +804,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } -// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. -// Normal formula for bilinear interpolation is: -// source_y_fraction * row1 + (1 - source_y_fraction) row0 -// SSE2 version using the a single multiply of difference: -// source_y_fraction * (row1 - row0) + row0 -// TODO(fbarchard): Specialize same as SSSE3. - -#define HAS_SCALEFILTERROWS_SSE2 -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 256. Blend 100 / 0. - cmp eax, 64 - je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. - cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 
- cmp eax, 192 - je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. - - movd xmm5, eax // xmm5 = y fraction - punpcklbw xmm5, xmm5 - psrlw xmm5, 1 - punpcklwd xmm5, xmm5 - punpckldq xmm5, xmm5 - punpcklqdq xmm5, xmm5 - pxor xmm4, xmm4 - - align 16 - xloop: - movdqa xmm0, [esi] // row0 - movdqa xmm2, [esi + edx] // row1 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - psubw xmm2, xmm0 // row1 - row0 - psubw xmm3, xmm1 - paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 - paddw xmm3, xmm3 - pmulhw xmm2, xmm5 // scale diff - pmulhw xmm3, xmm5 - paddw xmm0, xmm2 // sum rows - paddw xmm1, xmm3 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 16 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - } -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. -#define HAS_SCALEFILTERROWS_SSSE3 -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - // Dispatch to specialized filters if applicable. - cmp eax, 0 - je xloop100 // 0 / 128. Blend 100 / 0. - cmp eax, 32 - je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. - cmp eax, 64 - je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. - cmp eax, 96 - je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. - - movd xmm0, eax // high fraction 1..127. - neg eax - add eax, 128 - movd xmm5, eax // low fraction 127..1. - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - // General purpose row blend. - align 16 - xloop: - movdqa xmm0, [esi] - movdqa xmm2, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. 
- align 16 - xloop50: - movdqa xmm0, [esi] - movdqa xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqa xmm1, [esi] - movdqa xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqa xmm0, [esi] - sub ecx, 16 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - } -} - -__declspec(naked) __declspec(align(16)) -static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - __asm { - push esi - push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr - mov edx, [esp + 8 + 12] // src_stride - mov ecx, [esp + 8 + 16] // dst_width - mov eax, [esp + 8 + 20] // source_y_fraction (0..255) - sub edi, esi - shr eax, 1 - cmp eax, 0 // dispatch to specialized filters if applicable. - je xloop100 - cmp eax, 32 - je xloop75 - cmp eax, 64 - je xloop50 - cmp eax, 96 - je xloop25 - - movd xmm0, eax // high fraction 1..127. - neg eax - add eax, 128 - movd xmm5, eax // low fraction 127..1. - punpcklbw xmm5, xmm0 - punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 - - // General purpose row blend. - align 16 - xloop: - movdqu xmm0, [esi] - movdqu xmm2, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 - punpckhbw xmm1, xmm2 - pmaddubsw xmm0, xmm5 - pmaddubsw xmm1, xmm5 - psrlw xmm0, 7 - psrlw xmm1, 7 - packuswb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop - jmp xloop99 - - // Blend 25 / 75. - align 16 - xloop25: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop25 - jmp xloop99 - - // Blend 50 / 50. - align 16 - xloop50: - movdqu xmm0, [esi] - movdqu xmm1, [esi + edx] - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop50 - jmp xloop99 - - // Blend 75 / 25. - align 16 - xloop75: - movdqu xmm1, [esi] - movdqu xmm0, [esi + edx] - pavgb xmm0, xmm1 - pavgb xmm0, xmm1 - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop75 - jmp xloop99 - - // Blend 100 / 0 - Copy row unchanged. - align 16 - xloop100: - movdqu xmm0, [esi] - sub ecx, 16 - movdqu [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop100 - - // Extrude last pixel. - xloop99: - punpckhbw xmm0, xmm0 - pshufhw xmm0, xmm0, 0xff - punpckhqdq xmm0, xmm0 - movdqu [esi + edi], xmm0 - pop edi - pop esi - ret - } -} #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: @@ -1745,337 +1396,6 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -// Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version -// For more info see comment above ScaleFilterRows_SSE2 for MSVC++ -#define HAS_SCALEFILTERROWS_SSE2 -static void ScaleFilterRows_SSE2(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - - // General purpose row blend. - ".p2align 4 \n" - "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "psubw %%xmm0,%%xmm2 \n" - "psubw %%xmm1,%%xmm3 \n" - "paddw %%xmm2,%%xmm2 \n" - "paddw %%xmm3,%%xmm3 \n" - "pmulhw %%xmm5,%%xmm2 \n" - "pmulhw %%xmm5,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" - "100: \n" - "movdqa (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(source_y_fraction), // %3 - "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" -#endif - ); -} - -// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version -#define HAS_SCALEFILTERROWS_SSSE3 -static void ScaleFilterRows_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. 
- ".p2align 4 \n" - "1: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqa (%1),%%xmm0 \n" - "movdqa (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqa (%1),%%xmm1 \n" - "movdqa (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - ".p2align 4 \n" - "100: \n" - "movdqa (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - // Extrude last pixel. - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} - -static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, - const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "shr %3 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x20,%3 \n" - "je 75f \n" - "cmp $0x40,%3 \n" - "je 50f \n" - "cmp $0x60,%3 \n" - "je 25f \n" - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x80,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - - // General purpose row blend. - ".p2align 4 \n" - "1: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "pmaddubsw %%xmm5,%%xmm0 \n" - "pmaddubsw %%xmm5,%%xmm1 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 25 / 75. - ".p2align 4 \n" - "25: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 25b \n" - "jmp 99f \n" - - // Blend 50 / 50. - ".p2align 4 \n" - "50: \n" - "movdqu (%1),%%xmm0 \n" - "movdqu (%1,%4,1),%%xmm1 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 75 / 25. - ".p2align 4 \n" - "75: \n" - "movdqu (%1),%%xmm1 \n" - "movdqu (%1,%4,1),%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 75b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
- ".p2align 4 \n" - "100: \n" - "movdqu (%1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 100b \n" - - // Extrude last pixel. - "99: \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm0 \n" - "punpckhqdq %%xmm0,%%xmm0 \n" - "movdqu %%xmm0,(%1,%0,1) \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"(static_cast(src_stride)) // %4 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm5" -#endif - ); -} #endif // defined(__x86_64__) || defined(__i386__) #if !defined(LIBYUV_DISABLE_MIPS) && \ @@ -2085,11 +1405,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); -#define HAS_SCALEFILTERROWS_MIPS_DSPR2 -void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, - const unsigned char* src_ptr, - ptrdiff_t src_stride, - int dst_width, int source_y_fraction); #define HAS_SCALEROWDOWN4_MIPS_DSPR2 void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); @@ -2280,44 +1595,6 @@ static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -#if defined(HAS_SCALEFILTERROWS_SSE2) -// Filter row to 3/4 -static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - const uint8* s = src_ptr; - uint8* dend = dst_ptr + dst_width; - do { - dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; - dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; - dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; - dst_ptr += 3; - s += 4; - } while (dst_ptr < dend); -} - -#define HAS_SCALEROWDOWN34_SSE2 -// Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Box_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - SIMD_ALIGNED(uint8 row[kMaxStride]); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} - -// Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Box_SSE2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - assert((dst_width % 3 == 0) && (dst_width > 0)); - SIMD_ALIGNED(uint8 row[kMaxStride]); - ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); - ScaleFilterCols34_C(dst_ptr, row, dst_width); -} -#endif - static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { assert(dst_width % 3 == 0); @@ -2376,35 +1653,6 @@ static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -// Blend 2 rows into 1 with filtering. N x 2 to N x 1 -static void ScaleFilterRows_C(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - assert(dst_width > 0); - // Specialized case for 100% first row. Helps avoid reading beyond last row. 
- if (source_y_fraction == 0) { - memcpy(dst_ptr, src_ptr, dst_width); - dst_ptr[dst_width] = dst_ptr[dst_width - 1]; - return; - } - int y1_fraction = source_y_fraction; - int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - for (int x = 0; x < dst_width - 1; x += 2) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; - src_ptr += 2; - src_ptr1 += 2; - dst_ptr += 2; - } - if (dst_width & 1) { - dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; - dst_ptr += 1; - } - dst_ptr[0] = dst_ptr[-1]; -} - void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { assert(src_width > 0); @@ -2542,13 +1790,6 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } } #endif -#if defined(HAS_SCALEROWDOWN34_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSE2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSE2; - } -#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { @@ -2825,12 +2066,59 @@ static void ScalePlaneBox(int src_width, int src_height, } } -// Scale plane to/from any dimensions, with interpolation. +// Scale plane to/from any dimensions, with bilinear interpolation. -static void ScalePlaneBilinearSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +void ScalePlaneBilinear(int src_width, int src_height, + int dst_width, int dst_height, + int src_stride, int dst_stride, + const uint8* src_ptr, uint8* dst_ptr) { + assert(dst_width > 0); + assert(dst_height > 0); + assert(Abs(src_width) <= kMaxStride); + + SIMD_ALIGNED(uint8 row[kMaxStride + 16]); + + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = + InterpolateRow_C; +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSE2; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_Unaligned_SSSE3; + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { + InterpolateRow = InterpolateRows_Any_MIPS_DSPR2; + if (IS_ALIGNED(src_width, 4)) { + InterpolateRow = InterpolateRows_MIPS_DSPR2; + } + } +#endif int dx = 0; int dy = 0; int x = 0; int y = 0; if (dst_width <= Abs(src_width)) { dx = (Abs(src_width) << 16) / dst_width; x = (dx >> 1) - 32768; } else if (dst_width > 1) { dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); } // Negative src_width means horizontally mirror. if (src_width < 0) { x += (dst_width - 1) * dx; dx = -dx; src_width = -src_width; } if (dst_height <= src_height) { dy = (src_height << 16) / dst_height; y = (dy >> 1) - 32768; } else if (dst_height > 1) { dy = ((src_height - 1) << 16) / (dst_height - 1); }
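x, y, dx and dy above are 16.16 fixed point. A worked example of the down-scale branch, assuming src_width = 640 and dst_width = 320:

  dx = (640 << 16) / 320;   // 2.0 source pixels per output pixel
  x  = (dx >> 1) - 32768;   // 1.0 - 0.5 = 0.5 in 16.16, i.e. 32768
  // Output pixels sample source x = 0.5, 2.5, 4.5, ... : each reads
  // from the center of the 2-pixel source span it represents.

In the up-scale branch, dx instead spreads the (src_width - 1) interval over (dst_width - 1) steps, so the first and last output pixels line up with the first and last source pixels (up to integer truncation).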
- int maxx = (Abs(src_width) > 1) ? ((Abs(src_width) - 1) << 16) - 1 : 0; int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; - if (y > maxy) { - y = maxy; - } - for (int i = 0; i < dst_height; ++i) { - int xs = x; - int yi = y >> 16; - int yf = y & 0xffff; - const uint8* src0 = src_ptr + yi * src_stride; - const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0; - uint8* dst = dst_ptr; - for (int j = 0; j < dst_width; ++j) { - int xi = xs >> 16; - int xf = xs & 0xffff; - int x1 = (xi < (src_width - 1)) ? xi + 1 : xi; - int a = src0[xi]; - int b = src0[x1]; - int r0 = BLENDER(a, b, xf); - a = src1[xi]; - b = src1[x1]; - int r1 = BLENDER(a, b, xf); - *dst++ = BLENDER(r0, r1, yf); - xs += dx; - if (xs > maxx) - xs = maxx; + for (int j = 0; j < dst_height; ++j) { + if (y > maxy) { + y = maxy; } + int yi = y >> 16; + int yf = (y >> 8) & 255; + const uint8* src = src_ptr + yi * src_stride; + InterpolateRow(row, src, src_stride, src_width, yf); + ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; - if (y > maxy) - y = maxy; - } -} - - -// Scale plane to/from any dimensions, with bilinear interpolation. - -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { - assert(dst_width > 0); - assert(dst_height > 0); - if (Abs(src_width) > kMaxStride) { - ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src_ptr, dst_ptr); - - } else { - SIMD_ALIGNED(uint8 row[kMaxStride + 16]); - void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, - int dst_width, int source_y_fraction) = - ScaleFilterRows_C; -#if defined(HAS_SCALEFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_NEON; - } -#endif -#if defined(HAS_SCALEFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleFilterRows = ScaleFilterRows_SSE2; - } -#endif -#if defined(HAS_SCALEFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) { - ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleFilterRows = ScaleFilterRows_SSSE3; - } - } -#endif -#if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) { - ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2; - } -#endif - int dx = 0; - int dy = 0; - int x = 0; - int y = 0; - if (dst_width <= Abs(src_width)) { - dx = (Abs(src_width) << 16) / dst_width; - x = (dx >> 1) - 32768; - } else if (dst_width > 1) { - dx = ((Abs(src_width) - 1) << 16) / (dst_width - 1); - } - // Negative src_width means horizontally mirror. - if (src_width < 0) { - x += (dst_width - 1) * dx; - dx = -dx; - src_width = -src_width; - } - if (dst_height <= src_height) { - dy = (src_height << 16) / dst_height; - y = (dy >> 1) - 32768; - } else if (dst_height > 1) { - dy = ((src_height - 1) << 16) / (dst_height - 1); - } - int maxy = (src_height > 1) ? 
-    for (int j = 0; j < dst_height; ++j) {
-      if (y > maxy) {
-        y = maxy;
-      }
-      int yi = y >> 16;
-      int yf = (y >> 8) & 255;
-      const uint8* src = src_ptr + yi * src_stride;
-      ScaleFilterRows(row, src, src_stride, src_width, yf);
-      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
-      dst_ptr += dst_stride;
-      y += dy;
-    }
   }
 }
 
@@ -3010,11 +2197,10 @@ static void ScalePlaneAnySize(int src_width, int src_height,
                               int src_stride, int dst_stride,
                               const uint8* src_ptr, uint8* dst_ptr,
                               FilterMode filtering) {
-  if (!filtering) {
+  if (!filtering || src_width > kMaxStride) {
     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
   } else {
-    // fall back to non-optimized version
     ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src_ptr, dst_ptr);
   }
@@ -3031,7 +2217,7 @@ static void ScalePlaneDown(int src_width, int src_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
-  if (!filtering) {
+  if (!filtering || src_width > kMaxStride) {
     ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
   } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
@@ -3099,7 +2285,7 @@ void ScalePlane(const uint8* src, int src_stride,
 
 // Scale an I420 image.
 // This function in turn calls a scaling function for each plane.
-
+// TODO(fbarchard): Disable UNDER_ALLOCATED_HACK
 #define UNDER_ALLOCATED_HACK 1
 
 LIBYUV_API
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 989df55a6..3162f9e07 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -824,41 +824,51 @@ static void ScaleARGBBilinearDown(int src_height,
   int xr = (dx >= 0) ? xlast : x;
   xl = (xl >> 16) & ~3;  // Left edge aligned.
   xr = (xr >> 16) + 1;  // Right most pixel used.
-  int clip_src_width = ((xr - xl) + 1 + 3) & ~3;  // Width aligned to 4.
+  int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;  // Bytes, aligned to 4 pixels.
   src_argb += xl * 4;
   x -= (xl << 16);
-  assert(clip_src_width * 4 <= kMaxStride);
+  assert(clip_src_width <= kMaxStride);
+  // TODO(fbarchard): Remove clip_src_width alignment checks.
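Editor's note: the clip-width arithmetic in the hunk above is easy to misread, since the result is now in bytes rather than pixels. A small standalone check, with made-up edge values, is given below; nothing here comes from the patch itself:

// Standalone check of the ARGB clip-width math; xl and xr are made up.
#include <cassert>

int main() {
  int xl = 2;    // left-most source pixel used (already masked with & ~3)
  int xr = 14;   // right-most source pixel used
  // Pixel count, rounded up to a multiple of 4 pixels, then scaled by
  // 4 bytes per ARGB pixel so it can be compared against kMaxStride.
  int clip_src_width = (((xr - xl) + 1 + 3) & ~3) * 4;
  assert(clip_src_width == 64);  // 13 pixels -> 16 pixels -> 64 bytes
  return 0;
}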
   SIMD_ALIGNED(uint8 row[kMaxStride + 16]);
-  void (*ScaleARGBFilterRows)(uint8* dst_argb, const uint8* src_argb,
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSE2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSE2;
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSE2;
+        InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSSE3;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSSE3;
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSSE3;
+        InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_NEON;
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(clip_src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+    InterpolateRow = InterpolateRows_Any_MIPS_DSPR2;
     if (IS_ALIGNED(clip_src_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_NEON;
+      InterpolateRow = InterpolateRows_MIPS_DSPR2;
     }
   }
 #endif
@@ -877,7 +887,7 @@ static void ScaleARGBBilinearDown(int src_height,
     int yi = y >> 16;
     int yf = (y >> 8) & 255;
     const uint8* src = src_argb + yi * src_stride;
-    ScaleARGBFilterRows(row, src, src_stride, clip_src_width, yf);
+    InterpolateRow(row, src, src_stride, clip_src_width, yf);
     ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
     dst_argb += dst_stride;
     y += dy;
@@ -895,38 +905,44 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(dst_width * 4 <= kMaxStride);
-  void (*ScaleARGBFilterRows)(uint8* dst_argb, const uint8* src_argb,
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
       ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      ARGBInterpolateRow_C;
-#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSE2;
+    InterpolateRow = InterpolateRow_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSE2;
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSE2;
+        InterpolateRow = InterpolateRow_SSE2;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
+#if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_SSSE3;
+    InterpolateRow = InterpolateRow_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_Unaligned_SSSE3;
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
       if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-        ScaleARGBFilterRows = ARGBInterpolateRow_SSSE3;
+        InterpolateRow = InterpolateRow_SSSE3;
       }
     }
   }
 #endif
-#if defined(HAS_ARGBINTERPOLATEROW_NEON)
+#if defined(HAS_INTERPOLATEROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
-    ScaleARGBFilterRows = ARGBInterpolateRow_Any_NEON;
+    InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBFilterRows = ARGBInterpolateRow_NEON;
+      InterpolateRow = InterpolateRow_NEON;
     }
   }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRows_MIPS_DSPR2;
+  }
 #endif
   void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
       int dst_width, int x, int dx) = ScaleARGBFilterCols_C;
@@ -965,7 +981,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height,
       }
     }
     int yf = (y >> 8) & 255;
-    ScaleARGBFilterRows(dst_argb, rowptr, rowstride, dst_width, yf);
+    InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -1024,24 +1040,23 @@ static void ScaleARGBAnySize(int src_width, int src_height,
                              const uint8* src_argb, uint8* dst_argb,
                              int x, int dx, int y, int dy,
                              FilterMode filtering) {
-  if (!filtering ||
-      (src_width * 4 > kMaxStride && dst_width * 4 > kMaxStride)) {
-    ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                    src_stride, dst_stride, src_argb, dst_argb,
-                    x, dx, y, dy);
-    return;
-  }
-  if (dy >= 65536 || dst_width * 4 > kMaxStride) {
-    ScaleARGBBilinearDown(src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src_argb, dst_argb,
-                          x, dx, y, dy);
-  } else {
+  if (filtering && dy < 65536 && dst_width * 4 <= kMaxStride) {
     ScaleARGBBilinearUp(src_width, src_height,
                         clip_width, clip_height,
                         src_stride, dst_stride, src_argb, dst_argb,
                         x, dx, y, dy);
+    return;
   }
+  if (filtering && src_width * 4 < kMaxStride) {
+    ScaleARGBBilinearDown(src_height,
+                          clip_width, clip_height,
+                          src_stride, dst_stride, src_argb, dst_argb,
+                          x, dx, y, dy);
+    return;
+  }
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+                  src_stride, dst_stride, src_argb, dst_argb,
+                  x, dx, y, dy);
 }
 
 // Scale an ARGB image.
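Editor's note: every InterpolateRow variant selected above computes the same row blend. As a reference, here is a simplified plain-C sketch of that contract (width in bytes, fraction in 0..255); it mirrors the removed ScaleFilterRows_C logic minus the extrude of the trailing pixel, and omits the fast paths for fractions 0, 128, and 256 that the real kernels have:

// Simplified reference for the InterpolateRow contract: blend two source
// rows into dst, weighting the second row by source_y_fraction/256.
// Width is in bytes, so the same routine serves Y planes and ARGB rows.
#include <cstddef>

typedef unsigned char uint8;

void InterpolateRowSketch(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int width,
                          int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;  // row below
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8;
  }
}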
diff --git a/source/scale_mips.cc b/source/scale_mips.cc
index 66f2571a1..cfd48b5b0 100644
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -629,64 +629,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
   );
 }
 
-void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
-                                const unsigned char* src_ptr,
-                                ptrdiff_t src_stride,
-                                int dst_width, int source_y_fraction) {
-  int y0_fraction = 256 - source_y_fraction;
-  const unsigned char* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-    ".set push                                        \n"
-    ".set noreorder                                   \n"
-
-    "replv.ph        $t0, %[y0_fraction]              \n"
-    "replv.ph        $t1, %[source_y_fraction]        \n"
-
-   "1:                                                \n"
-    "lw              $t2, 0(%[src_ptr])               \n"
-    "lw              $t3, 0(%[src_ptr1])              \n"
-    "lw              $t4, 4(%[src_ptr])               \n"
-    "lw              $t5, 4(%[src_ptr1])              \n"
-    "muleu_s.ph.qbl  $t6, $t2, $t0                    \n"
-    "muleu_s.ph.qbr  $t7, $t2, $t0                    \n"
-    "muleu_s.ph.qbl  $t8, $t3, $t1                    \n"
-    "muleu_s.ph.qbr  $t9, $t3, $t1                    \n"
-    "muleu_s.ph.qbl  $t2, $t4, $t0                    \n"
-    "muleu_s.ph.qbr  $t3, $t4, $t0                    \n"
-    "muleu_s.ph.qbl  $t4, $t5, $t1                    \n"
-    "muleu_s.ph.qbr  $t5, $t5, $t1                    \n"
-    "addq.ph         $t6, $t6, $t8                    \n"
-    "addq.ph         $t7, $t7, $t9                    \n"
-    "addq.ph         $t2, $t2, $t4                    \n"
-    "addq.ph         $t3, $t3, $t5                    \n"
-    "shra.ph         $t6, $t6, 8                      \n"
-    "shra.ph         $t7, $t7, 8                      \n"
-    "shra.ph         $t2, $t2, 8                      \n"
-    "shra.ph         $t3, $t3, 8                      \n"
-    "precr.qb.ph     $t6, $t6, $t7                    \n"
-    "precr.qb.ph     $t2, $t2, $t3                    \n"
-    "addiu           %[src_ptr], %[src_ptr], 8        \n"
-    "addiu           %[src_ptr1], %[src_ptr1], 8      \n"
-    "addiu           %[dst_width], %[dst_width], -8   \n"
-    "sw              $t6, 0(%[dst_ptr])               \n"
-    "sw              $t2, 4(%[dst_ptr])               \n"
-    "bgtz            %[dst_width], 1b                 \n"
-    " addiu          %[dst_ptr], %[dst_ptr], 8        \n"
-
-    "lbu             $t0, -1(%[dst_ptr])              \n"
-    "sb              $t0, 0(%[dst_ptr])               \n"
-    ".set pop                                         \n"
-    : [dst_ptr] "+r" (dst_ptr),
-      [src_ptr1] "+r" (src_ptr1),
-      [src_ptr] "+r" (src_ptr),
-      [dst_width] "+r" (dst_width)
-    : [source_y_fraction] "r" (source_y_fraction),
-      [y0_fraction] "r" (y0_fraction),
-      [src_stride] "r" (src_stride)
-    : "t0", "t1", "t2", "t3", "t4", "t5",
-      "t6", "t7", "t8", "t9"
-  );
-}
 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 
 #ifdef __cplusplus
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 947531f04..4195bfa58 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -117,7 +117,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy2_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy2_Bilinear) {
@@ -143,7 +143,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy1_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 0);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy1_Bilinear) {
@@ -156,7 +156,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy1_Bilinear) {
                                 dst_width, dst_height,
                                 kFilterBilinear,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 0);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4_None) {
@@ -169,7 +169,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy4_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4_Bilinear) {
@@ -195,7 +195,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy5_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy5_Bilinear) {
@@ -221,7 +221,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy8_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
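Editor's note: the _None tests above and below tighten EXPECT_LE(max_diff, 1) to EXPECT_EQ(0, max_diff) because point sampling never does arithmetic on pixel values, so no rounding error is possible. A minimal sketch under made-up values, illustrating why exact equality is a safe expectation:

// Why kFilterNone can demand an exact match: point sampling only ever
// copies source pixels. Values and the centered start are illustrative.
#include <cassert>

int main() {
  const unsigned char src[8] = {10, 20, 30, 40, 50, 60, 70, 80};
  unsigned char dst[4];
  int dx = (8 << 16) / 4;     // 16.16 step for a 2x downscale
  int x = (dx >> 1) - 32768;  // a common centered starting offset
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[x >> 16];    // pure copy, no blending
    x += dx;
  }
  assert(dst[0] == src[0] && dst[3] == src[6]);
  return 0;
}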
 TEST_F(libyuvTest, ARGBScaleDownBy8_Bilinear) {
@@ -247,7 +247,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy16_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy16_Bilinear) {
@@ -263,6 +263,32 @@ TEST_F(libyuvTest, ARGBScaleDownBy16_Bilinear) {
   EXPECT_LE(max_diff, 2);
 }
 
+TEST_F(libyuvTest, ARGBScaleDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBTestFilter(src_width, src_height,
+                                dst_width, dst_height,
+                                kFilterNone,
+                                benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBTestFilter(src_width, src_height,
+                                dst_width, dst_height,
+                                kFilterBilinear,
+                                benchmark_iterations_);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, ARGBScaleDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
@@ -273,7 +299,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy34_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy34_Bilinear) {
@@ -299,7 +325,7 @@ TEST_F(libyuvTest, ARGBScaleDownBy38_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy38_Bilinear) {
@@ -325,7 +351,7 @@ TEST_F(libyuvTest, ARGBScaleTo1366x768_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo1366x768_Bilinear) {
@@ -352,7 +378,7 @@ TEST_F(libyuvTest, ARGBScaleTo1280x720_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo1280x720_Bilinear) {
@@ -378,7 +404,7 @@ TEST_F(libyuvTest, ARGBScaleTo853x480_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleTo853x480_Bilinear) {
@@ -404,7 +430,7 @@ TEST_F(libyuvTest, ARGBScaleFrom640x360_None) {
                                 dst_width, dst_height,
                                 kFilterNone,
                                 benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ARGBScaleFrom640x360_Bilinear) {
@@ -675,6 +701,32 @@ TEST_F(libyuvTest, ARGBScaleClipDownBy16_Bilinear) {
   EXPECT_EQ(0, max_diff);
 }
 
+TEST_F(libyuvTest, ARGBScaleClipDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBClipTestFilter(src_width, src_height,
+                                    dst_width, dst_height,
+                                    kFilterNone,
+                                    benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ARGBScaleClipDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = ARGBClipTestFilter(src_width, src_height,
+                                    dst_width, dst_height,
+                                    kFilterBilinear,
+                                    benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
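Editor's note: the new DownBy23 tests added above scale by 2/3, a ratio with no dedicated row kernel, so they exercise the generic any-size path. A small sketch of how the tests derive their target size, with placeholder benchmark dimensions:

// How the DownBy23 tests compute their destination size; 1280x720 stands
// in for benchmark_width_ x benchmark_height_, which vary per run.
#include <cstdlib>

int main() {
  const int src_width = 1280;   // placeholder for benchmark_width_
  const int src_height = 720;   // placeholder for benchmark_height_
  const int dst_width = abs(src_width) * 2 / 3;    // 853 (truncating divide)
  const int dst_height = abs(src_height) * 2 / 3;  // 480
  return (dst_width == 853 && dst_height == 480) ? 0 : 1;
}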
 TEST_F(libyuvTest, ARGBScaleClipDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index ff718b12c..5facf7d51 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -175,7 +175,7 @@ TEST_F(libyuvTest, ScaleDownBy2_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy2_Bilinear) {
@@ -214,7 +214,7 @@ TEST_F(libyuvTest, ScaleDownBy4_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);  // This is the only scale factor with error of 2.
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy4_Bilinear) {
@@ -253,7 +253,7 @@ TEST_F(libyuvTest, ScaleDownBy5_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy5_Bilinear) {
@@ -292,7 +292,7 @@ TEST_F(libyuvTest, ScaleDownBy8_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy8_Bilinear) {
@@ -331,7 +331,7 @@ TEST_F(libyuvTest, ScaleDownBy16_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy16_Bilinear) {
@@ -344,7 +344,7 @@ TEST_F(libyuvTest, ScaleDownBy16_Bilinear) {
                             dst_width, dst_height,
                             kFilterBilinear,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_LE(max_diff, 2);
 }
 
 TEST_F(libyuvTest, ScaleDownBy16_Box) {
@@ -360,6 +360,32 @@ TEST_F(libyuvTest, ScaleDownBy16_Box) {
   EXPECT_LE(max_diff, 1);
 }
 
+TEST_F(libyuvTest, ScaleDownBy23_None) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = TestFilter(src_width, src_height,
+                            dst_width, dst_height,
+                            kFilterNone,
+                            benchmark_iterations_);
+  EXPECT_EQ(0, max_diff);
+}
+
+TEST_F(libyuvTest, ScaleDownBy23_Bilinear) {
+  const int src_width = benchmark_width_;
+  const int src_height = benchmark_height_;
+  const int dst_width = Abs(src_width) * 2 / 3;
+  const int dst_height = Abs(src_height) * 2 / 3;
+
+  int max_diff = TestFilter(src_width, src_height,
+                            dst_width, dst_height,
+                            kFilterBilinear,
+                            benchmark_iterations_);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, ScaleDownBy34_None) {
   const int src_width = benchmark_width_;
   const int src_height = benchmark_height_;
@@ -370,7 +396,7 @@ TEST_F(libyuvTest, ScaleDownBy34_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy34_Bilinear) {
@@ -409,7 +435,7 @@ TEST_F(libyuvTest, ScaleDownBy38_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleDownBy38_Bilinear) {
@@ -448,7 +474,7 @@ TEST_F(libyuvTest, ScaleTo1366x768_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo1366x768_Bilinear) {
@@ -487,7 +513,7 @@ TEST_F(libyuvTest, ScaleTo1280x720_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo1280x720_Bilinear) {
@@ -526,7 +552,7 @@ TEST_F(libyuvTest, ScaleTo853x480_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 1);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleTo853x480_Bilinear) {
@@ -565,7 +591,7 @@ TEST_F(libyuvTest, ScaleFrom640x360_None) {
                             dst_width, dst_height,
                             kFilterNone,
                             benchmark_iterations_);
-  EXPECT_LE(max_diff, 2);
+  EXPECT_EQ(0, max_diff);
 }
 
 TEST_F(libyuvTest, ScaleFrom640x360_Bilinear) {