From 1702ec78f85cc484e10eeac501971f76ab173b83 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 5 Apr 2012 01:15:12 +0000 Subject: [PATCH] use movdqu on 2nd source for blend BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/479001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@235 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_posix.cc | 8 ++++---- source/row_win.cc | 30 ++++++++++++++++-------------- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/README.chromium b/README.chromium index 3af24f5a3..dc95dc2ce 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 234 +Version: 235 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index cdae68054..b1a23d415 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 234 +#define LIBYUV_VERSION 235 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_posix.cc b/source/row_posix.cc index f8979ace0..14a3a0bf9 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2048,14 +2048,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, "movdqu (%0),%%xmm3 \n" // first 4 pixels "movdqa %%xmm3,%%xmm0 \n" "pxor %%xmm4,%%xmm3 \n" - "movdqa (%1),%%xmm2 \n" + "movdqu (%1),%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa (%1),%%xmm1 \n" + "movdqu (%1),%%xmm1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" "pmullw %%xmm3,%%xmm1 \n" @@ -2070,14 +2070,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, "jle 9f \n" "movdqa %%xmm3,%%xmm0 \n" // next 4 pixels "pxor 
%%xmm4,%%xmm3 \n" - "movdqa 0x10(%1),%%xmm2 \n" + "movdqu 0x10(%1),%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "pshufhw $0xf5,%%xmm3,%%xmm3 \n" "pshuflw $0xf5,%%xmm3,%%xmm3 \n" "pand %%xmm6,%%xmm2 \n" "paddw %%xmm7,%%xmm3 \n" "pmullw %%xmm3,%%xmm2 \n" - "movdqa 0x10(%1),%%xmm1 \n" + "movdqu 0x10(%1),%%xmm1 \n" "lea 0x20(%1),%1 \n" "psrlw $0x8,%%xmm1 \n" "por %%xmm4,%%xmm0 \n" diff --git a/source/row_win.cc b/source/row_win.cc index c7c553774..472780565 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2075,11 +2075,13 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time -// Destination aligned to 16 bytes, multiple of 4 pixels +// Blend 8 pixels at a time. +// src_argb0 and src_argb1 may be unaligned. +// dst_argb aligned to 16 bytes. +// width must be multiple of 4 pixels. __declspec(naked) __declspec(align(16)) void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb0 @@ -2100,14 +2102,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, movdqu xmm3, [eax] movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi] // _r_b + movdqu xmm2, [esi] // _r_b psrlw xmm3, 8 // alpha pshufhw xmm3, xmm3,0F5h // 8 alpha words pshuflw xmm3, xmm3,0F5h pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi] // _a_g + movdqu xmm1, [esi] // _a_g psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 pmullw xmm1, xmm3 // _a_g * alpha @@ -2123,14 +2125,14 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi + 16] // _r_b + movdqu xmm2, [esi + 16] // _r_b psrlw xmm3, 8 // alpha pshufhw xmm3, xmm3,0F5h // 8 alpha words pshuflw xmm3,
xmm3,0F5h pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi + 16] // _a_g + movdqu xmm1, [esi + 16] // _a_g lea esi, [esi + 32] psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 @@ -2150,7 +2152,7 @@ void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1, } } -// Blend 1 pixel at a time, unaligned +// Blend 1 pixel at a time, unaligned. __declspec(naked) __declspec(align(16)) void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { @@ -2247,7 +2249,7 @@ static const uvec8 kShuffleAlpha = { // with.. // pshufb xmm3, kShuffleAlpha // alpha -// Destination aligned to 16 bytes, multiple of 4 pixels +// Destination aligned to 16 bytes, multiple of 4 pixels. __declspec(naked) __declspec(align(16)) void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { @@ -2272,11 +2274,11 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha pshufb xmm3, kShuffleAlpha // alpha - movdqa xmm2, [esi] // _r_b + movdqu xmm2, [esi] // _r_b pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi] // _a_g + movdqu xmm1, [esi] // _a_g psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 pmullw xmm1, xmm3 // _a_g * alpha @@ -2292,12 +2294,12 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, movdqa xmm0, xmm3 // src argb pxor xmm3, xmm4 // ~alpha - movdqa xmm2, [esi + 16] // _r_b + movdqu xmm2, [esi + 16] // _r_b pshufb xmm3, kShuffleAlpha // alpha pand xmm2, xmm6 // _r_b paddw xmm3, xmm7 // 256 - alpha pmullw xmm2, xmm3 // _r_b * alpha - movdqa xmm1, [esi + 16] // _a_g + movdqu xmm1, [esi + 16] // _a_g lea esi, [esi + 32] psrlw xmm1, 8 // _a_g por xmm0, xmm4 // set alpha to 255 @@ -2331,7 +2333,7 @@ void ARGBBlendRow_Any_SSSE3(const 
uint8* src_argb0, const uint8* src_argb1, dst_argb += count * 4; width -= count; } - // Do multiple of 4 pixels + // Do multiple of 4 pixels. if (width & ~3) { ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3); }