diff --git a/README.chromium b/README.chromium index 006d8f533..b442aee22 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1177 +Version: 1178 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index a88c4bb43..cb5de4ca2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1177 +#define LIBYUV_VERSION 1178 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 1929ab24c..ce63299be 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2446,8 +2446,6 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { } #endif // HAS_SPLITUVROW_SSE2 -// TODO(fbarchard): Consider vpunpcklbw, vpunpckhbw, store-low1, store-low2, -// extract-high1, extract-high2. 
#ifdef HAS_MERGEUVROW_AVX2 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { @@ -2458,13 +2456,12 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vperm2i128 $0x20,%%ymm0,%%ymm2,%%ymm1 \n" - "vperm2i128 $0x31,%%ymm0,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(2) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,2) " \n" + "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" + "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" + "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" "lea " MEMLEA(0x40,2) ",%2 \n" "sub $0x20,%3 \n" "jg 1b \n" diff --git a/source/row_win.cc b/source/row_win.cc index 3cfea2158..f65822a8f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2686,10 +2686,10 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, lea eax, [eax + 32] vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 - vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 - vmovdqu [edi], ymm1 - vmovdqu [edi + 32], ymm2 + vextractf128 [edi], ymm2, 0 // bytes 0..15 + vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 + vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 + vextractf128 [edi + 48], ymm0, 1 // bytes 48..63 lea edi, [edi + 64] sub ecx, 32 jg convertloop