diff --git a/README.chromium b/README.chromium
index e068628e2..29e4fa539 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 853
+Version: 854
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 4c5f62948..437063884 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 853
+#define LIBYUV_VERSION 854

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index 689152a21..779e53a86 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -902,7 +902,7 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     pextrw edx, xmm2, 3  // get x1 integer. preroll

     // 2 Pixel loop.
-    align 16
+    align 4
   xloop2:
     movdqa xmm1, xmm2  // x0, x1 fractions.
     paddd xmm2, xmm3  // x += dx
@@ -910,39 +910,38 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     movd xmm0, ebx
     psrlw xmm1, 9  // 7 bit fractions.
     movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
-    movd xmm7, ebx
+    movd xmm4, ebx
     pshufb xmm1, xmm5  // 0011
-    punpcklwd xmm0, xmm7
+    punpcklwd xmm0, xmm4
     pxor xmm1, xmm6  // 0..7f and 7f..0
     pmaddubsw xmm0, xmm1  // 16 bit, 2 pixels.
-    psrlw xmm0, 7  // 8.7 fixed point to low 8 bits.
     pextrw eax, xmm2, 1  // get x0 integer. next iteration.
     pextrw edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw xmm0, 7  // 8.7 fixed point to low 8 bits.
     packuswb xmm0, xmm0  // 8 bits, 2 pixels.
     movd ebx, xmm0
-    mov word ptr [edi], bx
+    mov [edi], bx
     lea edi, [edi + 2]
     sub ecx, 2  // 2 pixels
     jge xloop2

-    align 16
+    align 4
   xloop29:

     add ecx, 2 - 1
     jl xloop99

     // 1 pixel remainder
-    movdqa xmm1, xmm2  // x0, x1 fractions.
     movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd xmm0, ebx
-    psrlw xmm1, 9  // 7 bit fractions.
-    pshufb xmm1, xmm5  // 0011
-    pxor xmm1, xmm6  // 0..7f and 7f..0
-    pmaddubsw xmm0, xmm1  // 16 bit, 2 pixels.
+    psrlw xmm2, 9  // 7 bit fractions.
+    pshufb xmm2, xmm5  // 0011
+    pxor xmm2, xmm6  // 0..7f and 7f..0
+    pmaddubsw xmm0, xmm2  // 16 bit
     psrlw xmm0, 7  // 8.7 fixed point to low 8 bits.
-    packuswb xmm0, xmm0  // 8 bits, 2 pixels.
+    packuswb xmm0, xmm0  // 8 bits
     movd ebx, xmm0
-    mov byte ptr [edi], bl
+    mov [edi], bl

     align 16
   xloop99:
@@ -1638,7 +1637,7 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "punpckldq %%xmm3,%%xmm3 \n"
     "paddd %%xmm3,%%xmm3 \n"
     "pextrw $0x3,%%xmm2,%k4 \n"
-    ".p2align 4 \n"
+    ".p2align 2 \n"
   "2: \n"
     "movdqa %%xmm2,%%xmm1 \n"
     "paddd %%xmm3,%%xmm2 \n"
@@ -1646,25 +1645,24 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "movd %k5,%%xmm0 \n"
     "psrlw $0x9,%%xmm1 \n"
     "movzwl (%1,%4,1),%k5 \n"
-    "movd %k5,%%xmm7 \n"
+    "movd %k5,%%xmm4 \n"
     "pshufb %%xmm5,%%xmm1 \n"
-    "punpcklwd %%xmm7,%%xmm0 \n"
+    "punpcklwd %%xmm4,%%xmm0 \n"
     "pxor %%xmm6,%%xmm1 \n"
     "pmaddubsw %%xmm1,%%xmm0 \n"
-    "psrlw $0x7,%%xmm0 \n"
     "pextrw $0x1,%%xmm2,%k3 \n"
     "pextrw $0x3,%%xmm2,%k4 \n"
+    "psrlw $0x7,%%xmm0 \n"
     "packuswb %%xmm0,%%xmm0 \n"
     "movd %%xmm0,%k5 \n"
     "mov %w5,(%0) \n"
     "lea 0x2(%0),%0 \n"
     "sub $0x2,%2 \n"
     "jge 2b \n"
-    ".p2align 4 \n"
+    ".p2align 2 \n"
   "29: \n"
     "addl $0x1,%2 \n"
     "jl 99f \n"
-    "movdqa %%xmm2,%%xmm1 \n"
     "movzwl (%1,%3,1),%k5 \n"
     "movd %k5,%%xmm0 \n"
     "psrlw $0x9,%%xmm1 \n"
@@ -1686,7 +1684,7 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     "rm"(dx)  // %7
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
 #endif
   );
 }
@@ -2393,13 +2391,13 @@ static void ScalePlaneBox(int src_width, int src_height,
   }
 }

-// Scale plane to/from any dimensions, with bilinear interpolation.
+// Scale plane down with bilinear interpolation.
 SAFEBUFFERS
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr,
-                        FilterMode filtering) {
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            FilterMode filtering) {
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(Abs(src_width) <= kMaxStride);
@@ -2505,6 +2503,138 @@ void ScalePlaneBilinear(int src_width, int src_height,
   }
 }

+// Scale plane up with bilinear interpolation.
+SAFEBUFFERS
+void ScalePlaneBilinearUp(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr,
+                          FilterMode filtering) {
+  assert(src_width != 0);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  assert(Abs(dst_width) <= kMaxStride);
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleFilterCols_C;
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+  int dx = 0;
+  int dy = 0;
+  int x = 0;
+  int y = 0;
+  if (dst_width <= Abs(src_width)) {
+    dx = FixedDiv(Abs(src_width), dst_width);
+    x = (dx >> 1) - 32768;
+  } else if (dst_width > 1) {
+    dx = FixedDiv(Abs(src_width) - 1, dst_width - 1);
+  }
+  // Negative src_width means horizontally mirror.
+  if (src_width < 0) {
+    x += (dst_width - 1) * dx;
+    dx = -dx;
+    src_width = -src_width;
+  }
+  if (dst_height <= src_height) {
+    dy = FixedDiv(src_height, dst_height);
+    y = (dy >> 1) - 32768;
+  } else if (dst_height > 1) {
+    dy = FixedDiv(src_height - 1, dst_height - 1);
+  }
+
+  const int max_y = (src_height > 1) ?
+      ((src_height - 1) << 16) - 1 : 0;
+  if (y > max_y) {
+    y = max_y;
+  }
+  int yi = y >> 16;
+  const uint8* src = src_ptr + yi * src_stride;
+  SIMD_ALIGNED(uint8 row[2 * kMaxStride]);
+  uint8* rowptr = row;
+  int rowstride = kMaxStride;
+  int lasty = yi;
+
+  ScaleFilterCols(rowptr, src, dst_width, x, dx);
+  if (src_height > 1) {
+    src += src_stride;
+  }
+  ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+  src += src_stride;
+
+  for (int j = 0; j < dst_height; ++j) {
+    yi = y >> 16;
+    if (yi != lasty) {
+      if (y <= max_y) {
+        ScaleFilterCols(rowptr, src, dst_width, x, dx);
+        rowptr += rowstride;
+        rowstride = -rowstride;
+        lasty = yi;
+        src += src_stride;
+      }
+    }
+    if (filtering == kFilterLinear) {
+      InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+  }
+}
+
 // Scale plane to/from any dimensions, without interpolation.
 // Fixed point math is used for performance: The upper 16 bits
 // of x and dx is the integer part of the source position and
@@ -2540,40 +2670,29 @@ static void ScalePlaneSimple(int src_width, int src_height,
   }
 }

 // Scale plane to/from any dimensions.
-
 static void ScalePlaneAnySize(int src_width, int src_height,
                               int dst_width, int dst_height,
                               int src_stride, int dst_stride,
                               const uint8* src_ptr, uint8* dst_ptr,
                               FilterMode filtering) {
-  if (!filtering || src_width > kMaxStride) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr, filtering);
-  }
-}
-
-// Scale plane down, any size
-
-static void ScalePlaneDown(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_ptr, uint8* dst_ptr,
-                           FilterMode filtering) {
-  if (!filtering || src_width > kMaxStride) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else if (filtering == kFilterBilinear || filtering == kFilterLinear ||
-             dst_height * 2 > src_height) {
-    // between 1/2x and 1x use bilinear
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr, filtering);
-  } else {
+  if (filtering == kFilterBox && src_width <= kMaxStride &&
+      dst_height * 2 < src_height) {
     ScalePlaneBox(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src_ptr, dst_ptr);
+    return;
   }
+  if (filtering && dst_height > src_height && dst_width <= kMaxStride) {
+    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src_ptr, dst_ptr, filtering);
+    return;
+  }
+  if (filtering && src_width <= kMaxStride) {
+    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+                           src_stride, dst_stride, src_ptr, dst_ptr, filtering);
+    return;
+  }
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                   src_stride, dst_stride, src_ptr, dst_ptr);
 }

 // Scale a plane.
@@ -2591,45 +2710,51 @@ void ScalePlane(const uint8* src, int src_stride,
   if (dst_width == src_width && dst_height == src_height) {
     // Straight copy.
     CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
-  } else if (dst_width == src_width) {
+    return;
+  }
+  if (dst_width == src_width) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled vertically.
     ScalePlaneVertical(src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst,
                        0, 0, dy, 1, filtering);
-  } else if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
     if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
-    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
       ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
     // 3/8 rounded up for odd sized chroma height.
-    } else if (8 * dst_width == 3 * src_width &&
-               dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
       // optimized, 3/8
       ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
                filtering != kFilterBilinear) {
       // optimized, 1/4
       ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
-    } else {
-      // Arbitrary downsample
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
+      return;
     }
-  } else {
-    // Arbitrary scale up and/or down.
-    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
   }
+  // Arbitrary scale up and/or down.
+  ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
+                    src_stride, dst_stride, src, dst, filtering);
 }

 // Scale an I420 image.
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index f00dde26e..2c9fb615e 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -353,7 +353,7 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     pextrw edx, xmm2, 3  // get x1 integer. preroll

     // 2 Pixel loop.
-    align 16
+    align 4
   xloop2:
     movdqa xmm1, xmm2  // x0, x1 fractions.
     paddd xmm2, xmm3  // x += dx
@@ -364,16 +364,16 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     pshufb xmm0, xmm4  // arrange pixels into pairs
     pxor xmm1, xmm6  // 0..7f and 7f..0
     pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
-    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
     pextrw eax, xmm2, 1  // get x0 integer. next iteration.
     pextrw edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
     packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
     movq qword ptr [edi], xmm0
     lea edi, [edi + 8]
     sub ecx, 2  // 2 pixels
     jge xloop2

-    align 16
+    align 4
   xloop29:

     add ecx, 2 - 1
@@ -805,7 +805,7 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     "paddd %%xmm3,%%xmm3 \n"
     "pextrw $0x3,%%xmm2,%k4 \n"
-    ".p2align 4 \n"
+    ".p2align 2 \n"
     BUNDLEALIGN
   "2: \n"
     "movdqa %%xmm2,%%xmm1 \n"
     "paddd %%xmm3,%%xmm2 \n"
@@ -827,7 +827,7 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
     "sub $0x2,%2 \n"
     "jge 2b \n"

-    ".p2align 4 \n"
+    ".p2align 2 \n"
     BUNDLEALIGN
   "29: \n"
     "add $0x1,%2 \n"
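Not part of the patch, for readers following the assembly above: the ScaleFilterCols_SSSE3 changes only swap xmm7 for xmm4, drop an unneeded movdqa in the one-pixel remainder, and move the psrlw below the pextrw pair; the arithmetic is unchanged. The sketch below is an illustrative scalar model of that arithmetic (the name is invented here): x and dx are 16.16 fixed point, "psrlw 9" keeps a 7-bit fraction f, the pxor against 0x7f and pmaddubsw blend two neighbouring source pixels with weights 127 - f and f, and "psrlw 7" scales back to 8 bits.

#include <stdint.h>

// Illustrative scalar model of the SSSE3 column filter; not libyuv code.
// The caller must keep xi + 1 inside the source row, as the real setup does.
static void ScaleFilterColsModel(uint8_t* dst, const uint8_t* src,
                                 int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source index (the pextrw values)
    int xf = (x >> 9) & 0x7f;  // 7-bit fraction ("psrlw 9")
    int a = src[xi];
    int b = src[xi + 1];
    dst[j] = (uint8_t)((a * (xf ^ 0x7f) + b * xf) >> 7);  // pmaddubsw, psrlw 7
    x += dx;
  }
}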
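Also not from the patch: a minimal sketch of the two-row ping-pong that the new ScalePlaneBilinearUp uses, with hypothetical stand-ins (ScaleRowStub for ScaleFilterCols, BlendRowsStub for InterpolateRow) and a simple in-loop clamp in place of the patch's y <= max_y guard. Only the rows for yi and yi + 1 are kept; negating rowstride swaps which half of the buffer is "row 0", so each source row is filtered horizontally at most once no matter how many destination rows it feeds.

#include <stdint.h>
#include <stdlib.h>

// Hypothetical stand-ins for the real row helpers (not libyuv APIs).
static void ScaleRowStub(uint8_t* dst, const uint8_t* src, int width) {
  for (int i = 0; i < width; ++i) dst[i] = src[i];  // copy instead of filter
}
static void BlendRowsStub(uint8_t* dst, const uint8_t* row0, int rowstride,
                          int width, int yf) {  // yf: 0..255 vertical fraction
  const uint8_t* row1 = row0 + rowstride;
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((row0[i] * (256 - yf) + row1[i] * yf) >> 8);
  }
}

// Vertical upscale with a two-row ping-pong buffer, mirroring the structure
// of ScalePlaneBilinearUp. Assumes dst_height >= src_height, so the integer
// part of y advances by at most one per destination row.
static void ScaleUpVerticalSketch(const uint8_t* src, int src_stride,
                                  int src_height, uint8_t* dst, int dst_stride,
                                  int width, int dst_height) {
  int dy = (src_height > 1 && dst_height > 1)
               ? (int)((((int64_t)(src_height - 1)) << 16) / (dst_height - 1))
               : 0;  // 16.16 step; less than 1.0 for an upscale
  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
  uint8_t* row = (uint8_t*)malloc(2 * (size_t)width);
  uint8_t* rowptr = row;
  int rowstride = width;
  int y = 0;
  int lasty = 0;
  const uint8_t* s = src;
  ScaleRowStub(rowptr, s, width);              // row for yi = 0
  if (src_height > 1) s += src_stride;
  ScaleRowStub(rowptr + rowstride, s, width);  // row for yi + 1
  s += src_stride;
  for (int j = 0; j < dst_height; ++j) {
    if (y > max_y) y = max_y;  // clamp so the last rows blend within bounds
    int yi = y >> 16;
    if (yi != lasty) {
      ScaleRowStub(rowptr, s, width);  // next source row over the stale half
      rowptr += rowstride;             // the other half becomes "row 0"
      rowstride = -rowstride;
      lasty = yi;
      s += src_stride;
    }
    BlendRowsStub(dst, rowptr, rowstride, width, (y >> 8) & 255);
    dst += dst_stride;
    y += dy;
  }
  free(row);
}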
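Finally, a small worked check of the 16.16 fixed-point setup that both new bilinear paths share. It assumes a truncating FixedDiv(num, div) == (num << 16) / div, written out here as FixedDivExample to keep that assumption explicit: when scaling down, sampling starts half a destination pixel in (x = dx/2 - 0.5), and when scaling up, dx = (src_width - 1) / (dst_width - 1) keeps the last destination pixel at or before the last source pixel.

#include <assert.h>
#include <stdint.h>

// Assumed 16.16 division, used only for this worked example.
static int FixedDivExample(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

int main(void) {
  // Downscale 8 -> 4: dx = 2.0; the first sample sits at x = 1.0 - 0.5 = 0.5.
  int dx = FixedDivExample(8, 4);
  int x = (dx >> 1) - 32768;
  assert(dx == 0x20000 && x == 0x8000);

  // Upscale 4 -> 8: dx = 3/7 in 16.16; x starts at 0 and the last destination
  // pixel never steps past the last source pixel, so src[xi + 1] stays valid.
  dx = FixedDivExample(4 - 1, 8 - 1);
  assert(0 + 7 * dx <= ((4 - 1) << 16));
  return 0;
}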