diff --git a/README.chromium b/README.chromium index 23439308d..2c4937a49 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 104 +Version: 106 License: BSD License File: LICENSE diff --git a/source/row_posix.cc b/source/row_posix.cc index fad20b2a8..eadde7818 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi #endif #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2 + void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi uint8* rgb_buf, // rcx int width) { // r8 asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - "pxor %%xmm4,%%xmm4 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm2 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "mov $0x10001000,%%eax \n" + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "mov $0x012a012a,%%eax \n" + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "psubsw %%xmm3,%%xmm0 \n" - "pmullw %%xmm2,%%xmm0 \n" - "psraw $0x6,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm0 \n" - "movdqa %%xmm0,(%1) \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm1 \n" - "movdqa %%xmm1,16(%1) \n" - "lea 32(%1),%1 \n" + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,16(%1) \n" + "lea 32(%1),%1 \n" - "sub $0x8,%2 \n" - "ja 1b \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 "+rm"(width) // %2 - : "m"(kYuvConstants.kYSub16), // %3 - "m"(kYuvConstants.kYToRgb) // %4 - : "memory", "cc" + : + : "memory", "cc", "eax" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" #endif ); } diff --git a/source/row_win.cc b/source/row_win.cc index 217985a57..9acd70764 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf, uint8* rgb_buf, int width) { __asm { + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + mov eax,0x10001000 + movd xmm3,eax + pshufd xmm3,xmm3,0 + mov eax,0x012a012a + movd xmm2,eax + pshufd xmm2,xmm2,0 mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - pxor xmm4, xmm4 - movdqa xmm3, kYSub16 - movdqa xmm2, kYToRgb convertloop: // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - movq xmm0, qword ptr [eax] + movq xmm0, [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm4 - psubsw xmm0, xmm3 - pmullw xmm0, xmm2 - psraw xmm0, 6 + punpcklbw xmm0, xmm0 // Y.Y + psubusw xmm0, xmm3 + pmulhuw xmm0, xmm2 packuswb xmm0, xmm0 // G // Step 2: Weave into ARGB @@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 // BGRA first 4 pixels punpckhwd xmm1, xmm1 // BGRA next 4 pixels - por xmm0, xmm5 - por xmm1, xmm5 + por xmm0, xmm4 + por xmm1, xmm4 movdqa [edx], xmm0 movdqa [edx + 16], xmm1 lea edx, [edx + 32]