diff --git a/README.chromium b/README.chromium index 74f81d9ce..c8ec79fda 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 209 +Version: 210 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index a1e2f0a62..eea35e42f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 209 +#define LIBYUV_VERSION 210 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_neon.cc b/source/row_neon.cc index 00b2aa6ab..fb4205a79 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -218,10 +218,10 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // loop will run one extra time. "sub %2, #16 \n" - // mirror the bytes in the 64 bit segments. unable to mirror + // mirror the bytes in the 64 bit segments. unable to mirror // the bytes in the entire 128 bits in one go. // because of the inability to mirror the entire 128 bits - // mirror the writing out of the two 64 bit segments. + // mirror the writing out of the two 64 bit segments. "1: \n" "vld1.8 {q0}, [%0]! \n" // src += 16 "vrev64.8 q0, q0 \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index e4533e33e..12226db31 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1931,29 +1931,29 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "sub %0,%1 \n" "mov (%0),%3 \n" "sub $0x1,%2 \n" - "je 8f \n" // last1 + "jle 8f \n" // last1 "cmp $0xff000000,%3 \n" "jae 2f \n" // opaqueloop "cmp $0xffffff,%3 \n" - "ja 3f \n" // translucientloop + "ja 3f \n" // translucentloop // transparentloop "1: \n" "sub $0x1,%2 \n" "lea 0x4(%0),%0 \n" - "je 8f \n" // last1 + "jle 8f \n" // last1 "mov (%0),%3 \n" "cmp $0xffffff,%3 \n" "jbe 1b \n" // transparentloop "cmp $0xff000000,%3 \n" - "jb 3f \n" // translucientloop + "jb 3f \n" // translucentloop // opaqueloop "2: \n" "mov %3,(%0,%1,1) \n" "lea 0x4(%0),%0 \n" "sub $0x1,%2 \n" - "je 8f \n" // last1 + "jle 8f \n" // last1 "mov (%0),%3 \n" "cmp $0xff000000,%3 \n" "jae 2b \n" // opaqueloop @@ -1961,48 +1961,50 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "jbe 1b \n" // transparentloop "nop \n" - // translucientloop + // translucentloop "3: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%1,1),%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pshuflw $0xff,%%xmm0,%%xmm2 \n" - "pshufhw $0xff,%%xmm2,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "pxor %%xmm4,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "paddw %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%0,%1,1) \n" - "lea 0x8(%0),%0 \n" - "sub $0x2,%2 \n" - "jbe 8f \n" // last1 - "mov (%0),%3 \n" - "cmp $0xffffff,%3 \n" - "jbe 1b \n" // transparentloop - "cmp $0xff000000,%3 \n" - "jb 3b \n" // translucientloop - "jmp 2b \n" // opaqueloop - - // last1 - "8: \n" - "add $0x1,%2 \n" - "je 9f \n" // done "movd %3,%%xmm0 \n" "mov (%0,%1,1),%3 \n" "movd %3,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm1,%%xmm1 \n" "pshuflw $0xff,%%xmm0,%%xmm2 \n" - "pshufhw $0xff,%%xmm2,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" "pxor %%xmm4,%%xmm3 \n" "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm3,%%xmm1 \n" - "paddw %%xmm1,%%xmm0 \n" + "paddusw %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,%3 \n" + "mov %3,(%0,%1,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jle 8f \n" // last1 + "mov (%0),%3 \n" + "cmp $0xffffff,%3 \n" + "jbe 1b \n" // transparentloop + "cmp $0xff000000,%3 \n" + "jb 3b \n" // translucentloop + "jmp 2b \n" // opaqueloop + + // last1 + "8: \n" + "add $0x1,%2 \n" // 1 pixel left? + "cmp $0x1,%2 \n" + "jl 9f \n" // done + "mov (%0),%3 \n" + "movd %3,%%xmm0 \n" + "mov (%0,%1,1),%3 \n" + "movd %3,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pshuflw $0xff,%%xmm0,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "pxor %%xmm4,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,%3 \n" diff --git a/source/row_win.cc b/source/row_win.cc index ed6b073c3..5bf422069 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -477,7 +477,6 @@ __asm { } } -// TODO(fbarchard): Port to gcc __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { @@ -1965,40 +1964,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { mov edx, [esp + 4 + 8] // dst_argb mov ecx, [esp + 4 + 12] // width pcmpeqb xmm4, xmm4 // generate 0xffffffff do negative alpha + pcmpeqb xmm5, xmm5 // generate 0xff000000 for alpha + pslld xmm5, 24 sub edx, esi mov eax, [esi] // get first pixel sub ecx, 1 // ensure there are at least 2 pixels - je last1 // last pixel? + jle last1 // last pixel? cmp eax, 0xFF000000 // opaque? jae opaqueloop - cmp eax, 0x00FFFFFF // translucient? - ja translucientloop + cmp eax, 0x00FFFFFF // translucent? + ja translucentloop align 16 transparentloop: sub ecx, 1 lea esi, [esi + 4] - je last1 - mov eax, [esi] // handle remaining pixel + jle last1 + mov eax, [esi] // get next pixel cmp eax, 0x00FFFFFF // transparent? jbe transparentloop - cmp eax, 0xFF000000 // translucient? - jb translucientloop + cmp eax, 0xFF000000 // translucent? + jb translucentloop align 16 opaqueloop: mov dword ptr [esi + edx], eax lea esi, [esi + 4] sub ecx, 1 - je last1 - mov eax, [esi] // handle remaining pixel + jle last1 + mov eax, [esi] // get next pixel cmp eax, 0xFF000000 // opaque? jae opaqueloop cmp eax, 0x00FFFFFF // transparent? jbe transparentloop align 16 - translucientloop: + translucentloop: movq xmm0, qword ptr [esi] // fetch 2 pixels movq xmm1, qword ptr [esi + edx] punpcklbw xmm0, xmm0 // src 16 bits @@ -2009,39 +2010,42 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pxor xmm3, xmm4 pmulhuw xmm0, xmm2 // src * a pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff) - paddw xmm0, xmm1 + paddusw xmm0, xmm1 psrlw xmm0, 8 packuswb xmm0, xmm0 // pack 2 pixels + por xmm0, xmm5 // set alpha movq qword ptr [esi + edx], xmm0 lea esi, [esi + 8] sub ecx, 2 - jbe last1 - mov eax, [esi] // handle remaining pixel + jle last1 + mov eax, [esi] cmp eax, 0x00FFFFFF // transparent? jbe transparentloop - cmp eax, 0xFF000000 // translucient? - jb translucientloop + cmp eax, 0xFF000000 // translucent? + jb translucentloop jmp opaqueloop align 16 last1: add ecx, 1 - je done + cmp ecx, 1 // 1 left? + jl done + mov eax, [esi] // get next pixel movd xmm0, eax mov eax, [esi + edx] movd xmm1, eax punpcklbw xmm0, xmm0 // src 16 bits punpcklbw xmm1, xmm1 // dst 16 bits pshuflw xmm2, xmm0, 0xff // src alpha - pshufhw xmm2, xmm2, 0xff movdqa xmm3, xmm2 // dst alpha pxor xmm3, xmm4 pmulhuw xmm0, xmm2 // src * a pmulhuw xmm1, xmm3 // dst * (a ^ 0xffff) - paddw xmm0, xmm1 + paddusw xmm0, xmm1 psrlw xmm0, 8 packuswb xmm0, xmm0 // pack to bytes + por xmm0, xmm5 // set alpha movd eax, xmm0 mov dword ptr [esi + edx], eax