diff --git a/README.chromium b/README.chromium
index 96c4ad890..ac6bf7628 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 567
+Version: 568
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ce6f16bfb..4802bf604 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -128,6 +128,7 @@ extern "C" {
 // TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
+#define HAS_SPLITUVROW_AVX2
 #endif
 #endif
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 9b1b24ed1..49030f107 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 567
+#define LIBYUV_VERSION 568
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_win.cc b/source/row_win.cc
index b9e4a8488..cbb5e6c05 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -3238,6 +3238,81 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
   }
 }
 
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      16
+  convertloop:
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8         // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5      // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqa    [edx], ymm0
+    vmovdqa    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_Unaligned_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      16
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8         // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5      // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
diff --git a/source/row_x86.asm b/source/row_x86.asm
index c537b9f8a..8deb7f749 100644
--- a/source/row_x86.asm
+++ b/source/row_x86.asm
@@ -81,18 +81,14 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
     mov%1      m0, [src_uvq]
     mov%1      m1, [src_uvq + mmsize]
     lea        src_uvq, [src_uvq + mmsize * 2]
-    mova       m2, m0
-    mova       m3, m1
+    psrlw      m2, m0, 8              ; odd bytes
+    psrlw      m3, m1, 8
     pand       m0, m0, m4             ; even bytes
     pand       m1, m1, m4
     packuswb   m0, m0, m1
-%if cpuflag(AVX2)
-    vpermq     m0, m0, 0xd8
-%endif
-    psrlw      m2, m2, 8              ; odd bytes
-    psrlw      m3, m3, 8
     packuswb   m2, m2, m3
 %if cpuflag(AVX2)
+    vpermq     m0, m0, 0xd8
     vpermq     m2, m2, 0xd8
 %endif
     mov%1      [dst_uq], m0
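For context, row kernels such as SplitUVRow_AVX2 are normally chosen at run time from CPU features and buffer alignment rather than called directly. The sketch below is a minimal, hypothetical dispatcher and is not part of this patch: it assumes libyuv's TestCpuFlag/kCpuHasAVX2, IS_ALIGNED and a C fallback named SplitUVRow_C, and the helper name SplitUVPlaneSketch is invented for illustration. The aligned kernel uses vmovdqa, so it is only selected when all pointers and strides are 32-byte aligned; otherwise the vmovdqu variant is used.

// Hypothetical dispatch sketch (illustration only, not libyuv's actual code).
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

static void SplitUVPlaneSketch(const uint8* src_uv, int src_stride_uv,
                               uint8* dst_u, int dst_stride_u,
                               uint8* dst_v, int dst_stride_v,
                               int width, int height) {
  // Start with the portable C row function as the fallback.
  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int pix) = SplitUVRow_C;
#if defined(HAS_SPLITUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
    if (IS_ALIGNED(src_uv, 32) && IS_ALIGNED(src_stride_uv, 32) &&
        IS_ALIGNED(dst_u, 32) && IS_ALIGNED(dst_stride_u, 32) &&
        IS_ALIGNED(dst_v, 32) && IS_ALIGNED(dst_stride_v, 32)) {
      SplitUVRow = SplitUVRow_AVX2;            // aligned loads/stores
    } else {
      SplitUVRow = SplitUVRow_Unaligned_AVX2;  // unaligned loads/stores
    }
  }
#endif
  // Process the plane one row at a time with the selected kernel.
  for (int y = 0; y < height; ++y) {
    SplitUVRow(src_uv, dst_u, dst_v, width);
    src_uv += src_stride_uv;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
}

The width check mirrors the kernel's loop structure: each iteration consumes 64 bytes of interleaved UV (32 output pixels per plane), so a width that is a multiple of 32 avoids needing a scalar tail here.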