diff --git a/README.chromium b/README.chromium index d4bfff4fd..eeedc5ca7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 263 +Version: 264 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 7b18027aa..19de9cc8b 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 263 +#define LIBYUV_VERSION 264 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 3d19dae24..7731c2e4d 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -411,6 +411,92 @@ static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, ); } +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +// Reads 4 pixels at a time. +// Alignment requirement: dst_ptr 16 byte aligned. +static void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + intptr_t src_stepx_x4 = static_cast(src_stepx); + intptr_t src_stepx_x12 = 0; + asm volatile ( + "lea 0x0(,%1,4),%1 \n" + "lea (%1,%1,2),%4 \n" + ".p2align 4 \n" + "1: \n" + "movd (%0),%%xmm0 \n" + "movd (%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd (%0,%1,2),%%xmm2 \n" + "movd (%0,%4,1),%%xmm3 \n" + "lea (%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width), // %3 + "+r"(src_stepx_x12) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +// Blends four 2x2 to 4x1. +// Alignment requirement: dst_ptr 16 byte aligned. +static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr, int src_stride, + int src_stepx, + uint8* dst_ptr, int dst_width) { + intptr_t src_stepx_x4 = static_cast(src_stepx); + intptr_t src_stepx_x12 = 0; + intptr_t row1 = static_cast(src_stride); + asm volatile ( + "lea 0x0(,%1,4),%1 \n" + "lea (%1,%1,2),%4 \n" + "lea (%0,%5,1),%5 \n" + ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps (%0,%1,1),%%xmm0 \n" + "movq (%0,%1,2),%%xmm1 \n" + "movhps (%0,%4,1),%%xmm1 \n" + "lea (%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps (%5,%1,1),%%xmm2 \n" + "movq (%5,%1,2),%%xmm3 \n" + "movhps (%5,%4,1),%%xmm3 \n" + "lea (%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "sub $0x4,%3 \n" + "movdqa %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_ptr), // %2 + "+rm"(dst_width), // %3 + "+r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version #define HAS_SCALEARGBFILTERROWS_SSE2 static void ScaleARGBFilterRows_SSE2(uint8* dst_ptr,