diff --git a/README.chromium b/README.chromium
index 6a53f8cff..bea40118b 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 548
+Version: 549
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 9dace10ae..636a3cc15 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -121,6 +121,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBINTERPOLATEROW_SSE2
 #endif
 
 // The following are Yasm x86 only.
@@ -1306,6 +1307,9 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 void ARGBInterpolateRow_C(uint8* dst_argb, const uint8* src_argb,
                           ptrdiff_t src_stride_argb, int dst_width,
                           int source_y_fraction);
+void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
+                             ptrdiff_t src_stride_argb, int dst_width,
+                             int source_y_fraction);
 void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               ptrdiff_t src_stride_argb, int dst_width,
                               int source_y_fraction);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 73871022f..0699da8d4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 548
+#define LIBYUV_VERSION 549
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 80143e9a2..c24c2d307 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1284,6 +1284,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
   void (*ARGBInterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) = ARGBInterpolateRow_C;
+#if defined(HAS_ARGBINTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBInterpolateRow = ARGBInterpolateRow_SSE2;
+  }
+#endif
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
diff --git a/source/row_win.cc b/source/row_win.cc
index c66c1ce92..c1b8a7d34 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4724,7 +4724,117 @@ void ARGBInterpolateRow_SSSE3(uint8* dst_argb, const uint8* src_argb,
     lea        esi, [esi + 16]
     jg         xloop100
 
-    // Extrude last pixel.
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Bilinear image filtering.
+// Same as ScaleARGBFilterRows_SSE2 but without last pixel duplicated.
+__declspec(naked) __declspec(align(16))
+void ARGBInterpolateRow_SSE2(uint8* dst_argb, const uint8* src_argb,
+                             ptrdiff_t src_stride, int dst_width,
+                             int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_argb
+    mov        esi, [esp + 8 + 8]   // src_argb
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    cmp        eax, 0  // dispatch to specialized filters if applicable.
+    je         xloop100
+    cmp        eax, 64
+    je         xloop75
+    cmp        eax, 128
+    je         xloop50
+    cmp        eax, 192
+    je         xloop25
+
+    movd       xmm5, eax            // xmm5 = y fraction
+    punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
+    punpcklwd  xmm5, xmm5
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
+    pxor       xmm4, xmm4
+
+    align      16
+  xloop:
+    movdqa     xmm0, [esi]        // row0
+    movdqa     xmm2, [esi + edx]  // row1
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4
+    punpckhbw  xmm3, xmm4
+    punpcklbw  xmm0, xmm4
+    punpckhbw  xmm1, xmm4
+    psubw      xmm2, xmm0  // row1 - row0
+    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
+    pmulhw     xmm2, xmm5  // scale diff
+    pmulhw     xmm3, xmm5
+    paddw      xmm0, xmm2  // sum rows
+    paddw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      16
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
   xloop99:
     pop        edi
     pop        esi
diff --git a/source/scale.cc b/source/scale.cc
index f9543003c..a72a8e633 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -2483,7 +2483,7 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
   } while (dst_ptr < dend);
 }
 
-#define HAS_SCALEROWDOWN34_SSE2_DISABLED
+#define HAS_SCALEROWDOWN34_SSE2
 // Filter rows 0 and 1 together, 3 : 1
 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
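---

For context on what the new row function computes: ARGBInterpolateRow_SSE2 blends two rows of ARGB pixels with an 8-bit vertical fraction (weight of the second row), taking pavgb shortcuts when the fraction is 0, 64, 128, or 192. Below is a minimal scalar sketch of the general path's arithmetic, assuming the (row0 * (256 - f) + row1 * f) >> 8 form used by libyuv's ARGBInterpolateRow_C; the function name and comments here are illustrative and not part of this patch.

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the blend the SSE2 row vectorizes (illustrative only).
// The asm fast paths use pavgb, which rounds with (a + b + 1) >> 1, and the
// general path scales the doubled row difference through pmulhw, so either
// may differ from this truncating form by +/-1 per byte.
static void InterpolateRowSketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 ptrdiff_t src_stride, int dst_width,
                                 int source_y_fraction) {
  const uint8_t* row0 = src_argb;
  const uint8_t* row1 = src_argb + src_stride;
  const int f1 = source_y_fraction;  // weight of row1: 0..255
  const int f0 = 256 - f1;           // weight of row0
  for (int i = 0; i < dst_width * 4; ++i) {  // 4 bytes per ARGB pixel
    dst_argb[i] = (uint8_t)((row0[i] * f0 + row1[i] * f1) >> 8);
  }
}

Two chained pavgb ops yield the 25/75 and 75/25 blends from two 50/50 averages, which is why those fractions get dedicated loops. The IS_ALIGNED checks added to ARGBInterpolate in planar_functions.cc exist because the loops load and store with movdqa, which requires 16-byte-aligned pointers and strides, and each iteration consumes 4 pixels, hence the width-multiple-of-4 requirement.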