From 5822505e0addcf81bb3b9ea8fce294948de5c95d Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 17 Nov 2014 18:33:07 +0000 Subject: [PATCH] Remove extra unaligned loop from alphablender. Both aligned and unaligned loops were the same, so remove the extra. BUG=none TESTED=try bots. R=brucedawson@google.com, harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/29059004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1166 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 2 +- include/libyuv/version.h | 2 +- source/row_posix.cc | 34 +--------------------------------- source/row_win.cc | 38 +++----------------------------------- 5 files changed, 7 insertions(+), 71 deletions(-) diff --git a/README.chromium b/README.chromium index 10e16c439..30cc4ed6b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1164 +Version: 1165 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ffd837030..06ebf6f5b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -207,12 +207,12 @@ extern "C" { #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 -#define HAS_ARGBMIRRORROW_AVX2 #endif // The following are require VS2012. // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 482a3118f..ca4d6ccb6 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1164 +#define LIBYUV_VERSION 1165 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 18927600b..0d2fa7f82 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2183,10 +2183,9 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile ( "movdqa %3,%%xmm5 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" LABELALIGN "1: \n" - MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 + MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 "pshufb %%xmm5,%%xmm0 \n" "sub $0x10,%2 \n" "movdqu %%xmm0," MEMACCESS(1) " \n" @@ -3378,10 +3377,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "19: \n" "add $1-4,%3 \n" "jl 49f \n" - "test $0xf,%0 \n" - "jne 41f \n" - "test $0xf,%1 \n" - "jne 41f \n" // 4 pixel loop. LABELALIGN @@ -3408,33 +3403,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "movdqu %%xmm0," MEMACCESS(2) " \n" "lea " MEMLEA(0x10,2) ",%2 \n" "jge 40b \n" - "jmp 49f \n" - - // 4 pixel loop. - LABELALIGN - "41: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "sub $0x4,%3 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "jge 41b \n" "49: \n" "add $0x3,%3 \n" diff --git a/source/row_win.cc b/source/row_win.cc index c8d29a043..444791491 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2392,7 +2392,6 @@ void YToARGBRow_SSE2(const uint8* y_buf, } #endif // HAS_YTOARGBROW_SSE2 - #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. static const uvec8 kShuffleMirror = { @@ -2432,7 +2431,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { align 4 convertloop: - vmovdqu ymm0, [eax - 32 + ecx] + vmovdqu ymm0, -32[eax + ecx] vpshufb ymm0, ymm0, ymm5 vpermq ymm0, ymm0, 0x4e // swap high and low halfs sub ecx, 32 @@ -2455,7 +2454,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { align 4 convertloop: - movdqu xmm0, [eax - 16 + ecx] + movdqu xmm0, -16[eax + ecx] movdqa xmm1, xmm0 // swap bytes psllw xmm0, 8 psrlw xmm1, 8 @@ -2553,7 +2552,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { align 4 convertloop: - vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order + vpermd ymm0, ymm5, -32[eax + ecx * 4] // permute dword order sub ecx, 8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3608,11 +3607,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, add ecx, 1 - 4 jl convertloop4b - test eax, 15 // unaligned? - jne convertuloop4 - test esi, 15 // unaligned? - jne convertuloop4 - // 4 pixel loop. convertloop4: movdqu xmm3, [eax] // src argb @@ -3637,32 +3631,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, movdqu [edx], xmm0 lea edx, [edx + 16] jge convertloop4 - jmp convertloop4b - - // 4 pixel unaligned loop. - convertuloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - sub ecx, 4 - movdqu [edx], xmm0 - lea edx, [edx + 16] - jge convertuloop4 convertloop4b: add ecx, 4 - 1