diff --git a/README.chromium b/README.chromium index 00e918ffe..9f70d6007 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 784 +Version: 785 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5227e9dfa..6ad24ae15 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 784 +#define LIBYUV_VERSION 785 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index b6cacb683..a567c7d21 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -744,9 +744,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB24Row_C; #if defined(HAS_ARGBTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -792,9 +790,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRAWRow_C; #if defined(HAS_ARGBTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && - IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && - IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; diff --git a/source/row_posix.cc b/source/row_posix.cc index 9e8b1a5b4..6e8722f6a 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -569,10 +569,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %3,%%xmm6 \n" ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" "lea 0x40(%0),%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" @@ -584,13 +584,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" @@ -610,10 +610,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %3,%%xmm6 \n" ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa 0x20(%0),%%xmm2 \n" - "movdqa 0x30(%0),%%xmm3 \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" "lea 0x40(%0),%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" @@ -625,13 +625,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" - "movdqa %%xmm0,(%1) \n" + "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm1,0x10(%1) \n" - "movdqa %%xmm2,0x20(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" diff --git a/source/row_win.cc b/source/row_win.cc index 3e1aabd1b..1bf4a94ed 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -479,10 +479,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { align 16 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -494,13 +494,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -518,10 +518,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { align 16 convertloop: - movdqa xmm0, [eax] // fetch 16 pixels of argb - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + 32] - movdqa xmm3, [eax + 48] + movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] lea eax, [eax + 64] pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 @@ -533,13 +533,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { movdqa xmm5, xmm2 // 8 bytes from 2 for 1 por xmm0, xmm4 // 4 bytes from 1 for 0 pslldq xmm5, 8 // 8 bytes from 2 for 1 - movdqa [edx], xmm0 // store 0 + movdqu [edx], xmm0 // store 0 por xmm1, xmm5 // 8 bytes from 2 for 1 psrldq xmm2, 8 // 4 bytes from 2 pslldq xmm3, 4 // 12 bytes from 3 for 2 por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqa [edx + 16], xmm1 // store 1 - movdqa [edx + 32], xmm2 // store 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop