From 0e6ce93c84f710e6a589c6c6edfe480ad0567f0c Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 20 Jan 2012 00:42:00 +0000 Subject: [PATCH] address munge for rowreverse. And compute green mask based on red mask to save one shift. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/363001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@144 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- source/row_posix.cc | 14 ++--- source/row_win.cc | 131 +++----------------------------------------- 3 files changed, 14 insertions(+), 133 deletions(-) diff --git a/README.chromium b/README.chromium index d41c158e6..7e25f590c 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 143 +Version: 144 License: BSD License File: LICENSE diff --git a/source/row_posix.cc b/source/row_posix.cc index d57260b2a..984281be3 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -655,14 +655,13 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( "movdqa %3,%%xmm5 \n" - "lea -0x10(%0,%2,1),%0 \n" + "lea -0x10(%0),%0 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" + "movdqa (%0,%2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -681,10 +680,9 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) { void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast<intptr_t>(width); asm volatile ( - "lea -0x10(%0,%2,1),%0 \n" + "lea -0x10(%0),%0 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" + "movdqa (%0,%2),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "psllw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" @@ -692,9 +690,9 @@ void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) { "pshuflw $0x1b,%%xmm0,%%xmm0 \n" "pshufhw 
$0x1b,%%xmm0,%%xmm0 \n" "pshufd $0x4e,%%xmm0,%%xmm0 \n" + "sub $0x10,%2 \n" "movdqa %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 diff --git a/source/row_win.cc b/source/row_win.cc index f47ea4b83..90260ef11 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -238,120 +238,6 @@ __asm { } } -#ifdef SHIFT565 -// Below shift/mask code is efficient and works, but more instructions than -// pmul method -// TODO(fbarchard): Port RGB565ToARGBRow_SSE2 to gcc -// 29 instructions -__declspec(naked) -void OldRGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 for Alpha - pslld xmm5, 24 - pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 for Red - psllw xmm4, 11 - pcmpeqb xmm6, xmm6 // generate mask 0x001f001f for Blue - psrlw xmm6, 11 - pcmpeqb xmm7, xmm7 // generate mask 0x00fc00fc for Green - psrlw xmm7, 10 - psllw xmm7, 2 - - convertloop: - movdqa xmm0, [eax] // fetch 8 pixels of bgr565 - lea eax, [eax + 16] - movdqa xmm1, xmm0 - movdqa xmm2, xmm0 - pand xmm1, xmm4 // R in upper 5 bits - psrlw xmm2, 13 // R 3 bits - psllw xmm2, 8 - por xmm1, xmm2 - movdqa xmm2, xmm0 - pand xmm2, xmm6 // mask B 5 bits - movdqa xmm3, xmm2 - psllw xmm2, 3 - psrlw xmm3, 2 - por xmm2, xmm3 - por xmm1, xmm2 // RB - psrlw xmm0, 3 // G in top 6 bits of lower byte - pand xmm0, xmm7 // mask G 6 bits - movdqa xmm2, xmm0 - psrlw xmm2, 6 - por xmm0, xmm2 - por xmm0, xmm5 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqa [edx], xmm1 // store 4 pixels of ARGB - movdqa [edx + 16], xmm2 // store next 4 pixels of ARGB - lea edx, [edx + 32] - sub ecx, 8 - ja convertloop - ret - } -} - -// TODO(fbarchard): Port ARGB1555ToARGBRow_SSE2 to gcc -// 33 instructions -__declspec(naked) -void OldARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* 
dst_argb, - int pix) { -__asm { - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // pix - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 for Alpha - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xf800f800 for Red - psllw xmm4, 11 - pcmpeqb xmm6, xmm6 // generate mask 0x001f001f for Blue - psrlw xmm6, 11 - pcmpeqb xmm7, xmm7 // generate mask 0x00f800f8 for Green - psrlw xmm7, 11 - psllw xmm7, 3 - - convertloop: - movdqa xmm0, [eax] // fetch 8 pixels of bgr565 - lea eax, [eax + 16] - movdqa xmm1, xmm0 - psllw xmm1, 1 - movdqa xmm2, xmm0 - pand xmm1, xmm4 // R in upper 5 bits - psrlw xmm2, 13 // R 3 bits - psllw xmm2, 8 - por xmm1, xmm2 - movdqa xmm2, xmm0 - pand xmm2, xmm6 // mask B 5 bits - movdqa xmm3, xmm2 - psllw xmm2, 3 - psrlw xmm3, 2 - por xmm2, xmm3 - por xmm1, xmm2 // RB - movdqa xmm2, xmm0 - psrlw xmm2, 2 // G in top 5 bits of lower byte - pand xmm2, xmm7 // mask G 5 bits - movdqa xmm3, xmm2 - psrlw xmm3, 5 - por xmm2, xmm3 - psraw xmm0, 8 // A - pand xmm0, xmm5 - por xmm0, xmm2 // AG - movdqa xmm2, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm2, xmm0 - movdqa [edx], xmm1 // store 4 pixels of ARGB - movdqa [edx + 16], xmm2 // store next 4 pixels of ARGB - lea edx, [edx + 32] - sub ecx, 8 - ja convertloop - ret - } -} -#endif - // pmul method to replicate bits // Math to replicate bits // (v << 8) | (v << 3) @@ -422,8 +308,7 @@ __asm { pshufd xmm6, xmm6, 0 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x03e003e0 for Green - psllw xmm4, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 @@ -1305,14 +1190,13 @@ __asm { mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, kShuffleReverse - lea eax, [eax + ecx - 16] - convertloop: - movdqa xmm0, [eax] lea eax, [eax - 16] + convertloop: + movdqa xmm0, [eax + ecx] pshufb xmm0, xmm5 + sub ecx, 16 
movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 ja convertloop ret } @@ -1327,10 +1211,9 @@ __asm { mov eax, [esp + 4] // src mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width - lea eax, [eax + ecx - 16] - convertloop: - movdqa xmm0, [eax] lea eax, [eax - 16] + convertloop: + movdqa xmm0, [eax + ecx] movdqa xmm1, xmm0 // swap bytes psllw xmm0, 8 psrlw xmm1, 8 @@ -1338,9 +1221,9 @@ __asm { pshuflw xmm0, xmm0, 0x1b // swap words pshufhw xmm0, xmm0, 0x1b pshufd xmm0, xmm0, 0x4e // swap qwords + sub ecx, 16 movdqa [edx], xmm0 lea edx, [edx + 16] - sub ecx, 16 ja convertloop ret }