diff --git a/README.chromium b/README.chromium
index 767780475..0936990f3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 446
+Version: 447
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 017a336cc..00b20b1de 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -86,6 +86,7 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_I422TOYUY2ROW_SSE2
 #define HAS_I422TOUYVYROW_SSE2
+#define HAS_MERGEUV_SSE2
 
 // Effects
 #define HAS_ARGBAFFINEROW_SSE2
@@ -120,6 +121,7 @@ extern "C" {
 #define HAS_UYVYTOYROW_AVX2
 #define HAS_YUY2TOYROW_MMX
 #define HAS_UYVYTOYROW_MMX
+#define HAS_MERGEUV_SSE2
 #endif
 
 // The following are disabled when SSSE3 is available:
@@ -311,6 +313,8 @@ void SplitUV_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 void MergeUV_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                int width);
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
 void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width);
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index e5c9c063e..06aee083e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 446
+#define LIBYUV_VERSION 447
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 8e5f9ad8b..73ed900bd 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -521,7 +521,14 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   void (*MergeUV)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                   int width) = MergeUV_C;
-#if defined(HAS_SPLITUV_NEON)
+#if defined(HAS_MERGEUV_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+    MergeUV = MergeUV_SSE2;
+  }
+#elif defined(HAS_MERGEUV_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) {
     MergeUV = MergeUV_NEON;
   }
@@ -529,7 +536,7 @@ int I420ToNV12(const uint8* src_y, int src_stride_y,
   CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   int halfheight = (height + 1) >> 1;
   for (int y = 0; y < halfheight; ++y) {
-    // Copy a row of UV.
+    // Merge a row of U and V into a row of UV.
     MergeUV(src_u, src_v, dst_uv, halfwidth);
     src_u += src_stride_u;
     src_v += src_stride_v;
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 267cd4b7a..4c11d4fc2 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2547,6 +2547,37 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
 
+#ifdef HAS_MERGEUV_SSE2
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  asm volatile (
+    "sub        %0,%1                          \n"
+    ".p2align   4                              \n"
+  "1:                                          \n"
+    "movdqa     (%0),%%xmm0                    \n"
+    "movdqa     (%0,%1,1),%%xmm1               \n"
+    "lea        0x10(%0),%0                    \n"
+    "movdqa     %%xmm0,%%xmm2                  \n"
+    "punpcklbw  %%xmm1,%%xmm0                  \n"
+    "punpckhbw  %%xmm1,%%xmm2                  \n"
+    "movdqa     %%xmm0,(%2)                    \n"
+    "movdqa     %%xmm2,0x10(%2)                \n"
+    "lea        0x20(%2),%2                    \n"
+    "sub        $0x10,%3                       \n"
+    "jg         1b                             \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_MERGEUV_SSE2
+
 #ifdef HAS_COPYROW_SSE2
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   asm volatile (
diff --git a/source/row_win.cc b/source/row_win.cc
index 638ce05ac..d1041873e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2620,6 +2620,38 @@ void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
 }
 #endif  // HAS_SPLITUV_SSE2
 
+#ifdef HAS_MERGEUV_SSE2
+__declspec(naked) __declspec(align(16))
+void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]           // read 16 U's
+    movdqa     xmm1, [eax + edx]     // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1            // first 8 UV pairs
+    punpckhbw  xmm2, xmm1            // next 8 UV pairs
+    movdqa     [edi], xmm0
+    movdqa     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_MERGEUV_SSE2
+
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
 __declspec(naked) __declspec(align(16))
diff --git a/source/row_x86.asm b/source/row_x86.asm
index ce585e446..a1982929b 100644
--- a/source/row_x86.asm
+++ b/source/row_x86.asm
@@ -1,12 +1,12 @@
-; 
+;
 ; Copyright 2012 The LibYuv Project Authors. All rights reserved.
-; 
+;
 ; Use of this source code is governed by a BSD-style license
 ; that can be found in the LICENSE file in the root of the source
 ; tree. An additional intellectual property rights grant can be found
 ; in the file PATENTS. All contributing project authors may
 ; be found in the AUTHORS file in the root of the source tree.
-; 
+;
 
 %ifdef __YASM_VERSION_ID__
 %if __YASM_VERSION_ID__ < 01020000h
@@ -17,6 +17,8 @@
 
 SECTION .text
 
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
 ; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 
 %macro YUY2TOYROW 2-3
@@ -63,8 +65,7 @@ YUY2TOYROW YUY2,u,_Unaligned
 YUY2TOYROW UYVY,a,
 YUY2TOYROW UYVY,u,_Unaligned
 
-; void SplitUV_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-;                             int pix) {
+; void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 
 %macro SPLITUV 1-2
 cglobal SplitUV%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
@@ -103,3 +104,36 @@ INIT_YMM AVX2
 SPLITUV a,
 SPLITUV u,_Unaligned
 
+; void MergeUV_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+;                   int width);
+
+%macro MERGEUV 1-2
+cglobal MergeUV%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+    sub        src_vq, src_uq
+
+    ALIGN      16
+.convertloop:
+    mov%1      m0, [src_uq]
+    mov%1      m1, [src_uq + src_vq]
+    lea        src_uq, [src_uq + mmsize]
+    mova       m2, m0
+    punpcklbw  m0, m0, m1            ; first 8 UV pairs
+    punpckhbw  m2, m2, m1            ; next 8 UV pairs
+    mov%1      [dst_uvq], m0
+    mov%1      [dst_uvq + mmsize], m2
+    lea        dst_uvq, [dst_uvq + mmsize * 2]
+    sub        pixd, mmsize
+    jg         .convertloop
+    REP_RET
+%endmacro
+
+INIT_MMX MMX
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_XMM SSE2
+MERGEUV a,
+MERGEUV u,_Unaligned
+INIT_YMM AVX2
+MERGEUV a,
+MERGEUV u,_Unaligned
+
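
Note: every kernel added here implements the same byte interleave as the scalar MergeUV_C fallback that I420ToNV12 selects by default. For spot-checking the SIMD output, a minimal C sketch of that behavior follows; MergeUV_Ref is an illustrative name, not a libyuv symbol, and libyuv's uint8 is assumed to be plain uint8_t.

#include <stdint.h>

// Reference behavior: interleave 'width' bytes of U with 'width' bytes
// of V into UV pairs, dst_uv[] = U0 V0 U1 V1 ... The SSE2 kernels above
// produce the same layout 16 pixels per iteration: punpcklbw pairs the
// low 8 U bytes with the low 8 V bytes, punpckhbw pairs the high 8, so
// each loop reads 16+16 source bytes and writes 32 bytes of UV.
static void MergeUV_Ref(const uint8_t* src_u, const uint8_t* src_v,
                        uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = src_u[x];  // U lands in the even bytes
    dst_uv[2 * x + 1] = src_v[x];  // V lands in the odd bytes
  }
}

All three SIMD versions also share one addressing trick: subtracting src_u from src_v up front turns src_v into a constant offset, so a single pointer increment per iteration advances both source planes.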