mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-14 22:29:52 +08:00
I400ToARGB: use 8.8 fixed point to avoid a shift. The gcc version generates its constants in registers instead of loading them from memory, to avoid the -fPIC performance stall.
BUG=none TEST=none Review URL: http://webrtc-codereview.appspot.com/322013 git-svn-id: http://libyuv.googlecode.com/svn/trunk@106 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
9cece4b198
commit
8b9759c4a7
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 104
|
Version: 106
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -558,47 +558,49 @@ void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf, // rdi
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
|
#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
|
||||||
|
|
||||||
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
|
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||||
uint8* rgb_buf, // rcx
|
uint8* rgb_buf, // rcx
|
||||||
int width) { // r8
|
int width) { // r8
|
||||||
asm volatile (
|
asm volatile (
|
||||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||||
"pslld $0x18,%%xmm5 \n"
|
"pslld $0x18,%%xmm4 \n"
|
||||||
"pxor %%xmm4,%%xmm4 \n"
|
"mov $0x10001000,%%eax \n"
|
||||||
"movdqa %3,%%xmm3 \n"
|
"movd %%eax,%%xmm3 \n"
|
||||||
"movdqa %4,%%xmm2 \n"
|
"pshufd $0x0,%%xmm3,%%xmm3 \n"
|
||||||
|
"mov $0x012a012a,%%eax \n"
|
||||||
|
"movd %%eax,%%xmm2 \n"
|
||||||
|
"pshufd $0x0,%%xmm2,%%xmm2 \n"
|
||||||
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
||||||
"movq (%0),%%xmm0 \n"
|
"movq (%0),%%xmm0 \n"
|
||||||
"lea 0x8(%0),%0 \n"
|
"lea 0x8(%0),%0 \n"
|
||||||
"punpcklbw %%xmm4,%%xmm0 \n"
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||||
"psubsw %%xmm3,%%xmm0 \n"
|
"psubusw %%xmm3,%%xmm0 \n"
|
||||||
"pmullw %%xmm2,%%xmm0 \n"
|
"pmulhuw %%xmm2,%%xmm0 \n"
|
||||||
"psraw $0x6,%%xmm0 \n"
|
"packuswb %%xmm0,%%xmm0 \n"
|
||||||
"packuswb %%xmm0,%%xmm0 \n"
|
|
||||||
|
|
||||||
// Step 2: Weave into ARGB
|
// Step 2: Weave into ARGB
|
||||||
"punpcklbw %%xmm0,%%xmm0 \n"
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
"punpcklwd %%xmm0,%%xmm0 \n"
|
"punpcklwd %%xmm0,%%xmm0 \n"
|
||||||
"por %%xmm5,%%xmm0 \n"
|
"punpckhwd %%xmm1,%%xmm1 \n"
|
||||||
"movdqa %%xmm0,(%1) \n"
|
"por %%xmm4,%%xmm0 \n"
|
||||||
"punpckhwd %%xmm1,%%xmm1 \n"
|
"por %%xmm4,%%xmm1 \n"
|
||||||
"por %%xmm5,%%xmm1 \n"
|
"movdqa %%xmm0,(%1) \n"
|
||||||
"movdqa %%xmm1,16(%1) \n"
|
"movdqa %%xmm1,16(%1) \n"
|
||||||
"lea 32(%1),%1 \n"
|
"lea 32(%1),%1 \n"
|
||||||
|
|
||||||
"sub $0x8,%2 \n"
|
"sub $0x8,%2 \n"
|
||||||
"ja 1b \n"
|
"ja 1b \n"
|
||||||
: "+r"(y_buf), // %0
|
: "+r"(y_buf), // %0
|
||||||
"+r"(rgb_buf), // %1
|
"+r"(rgb_buf), // %1
|
||||||
"+rm"(width) // %2
|
"+rm"(width) // %2
|
||||||
: "m"(kYuvConstants.kYSub16), // %3
|
:
|
||||||
"m"(kYuvConstants.kYToRgb) // %4
|
: "memory", "cc", "eax"
|
||||||
: "memory", "cc"
|
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__)
|
||||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -780,23 +780,25 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
uint8* rgb_buf,
|
uint8* rgb_buf,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
|
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||||
|
pslld xmm4, 24
|
||||||
|
mov eax,0x10001000
|
||||||
|
movd xmm3,eax
|
||||||
|
pshufd xmm3,xmm3,0
|
||||||
|
mov eax,0x012a012a
|
||||||
|
movd xmm2,eax
|
||||||
|
pshufd xmm2,xmm2,0
|
||||||
mov eax, [esp + 4] // Y
|
mov eax, [esp + 4] // Y
|
||||||
mov edx, [esp + 8] // rgb
|
mov edx, [esp + 8] // rgb
|
||||||
mov ecx, [esp + 12] // width
|
mov ecx, [esp + 12] // width
|
||||||
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
|
||||||
pslld xmm5, 24
|
|
||||||
pxor xmm4, xmm4
|
|
||||||
movdqa xmm3, kYSub16
|
|
||||||
movdqa xmm2, kYToRgb
|
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
||||||
movq xmm0, qword ptr [eax]
|
movq xmm0, [eax]
|
||||||
lea eax, [eax + 8]
|
lea eax, [eax + 8]
|
||||||
punpcklbw xmm0, xmm4
|
punpcklbw xmm0, xmm0 // Y.Y
|
||||||
psubsw xmm0, xmm3
|
psubusw xmm0, xmm3
|
||||||
pmullw xmm0, xmm2
|
pmulhuw xmm0, xmm2
|
||||||
psraw xmm0, 6
|
|
||||||
packuswb xmm0, xmm0 // G
|
packuswb xmm0, xmm0 // G
|
||||||
|
|
||||||
// Step 2: Weave into ARGB
|
// Step 2: Weave into ARGB
|
||||||
@ -804,8 +806,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
|
punpcklwd xmm0, xmm0 // BGRA first 4 pixels
|
||||||
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
|
punpckhwd xmm1, xmm1 // BGRA next 4 pixels
|
||||||
por xmm0, xmm5
|
por xmm0, xmm4
|
||||||
por xmm1, xmm5
|
por xmm1, xmm4
|
||||||
movdqa [edx], xmm0
|
movdqa [edx], xmm0
|
||||||
movdqa [edx + 16], xmm1
|
movdqa [edx + 16], xmm1
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user