mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Fix for I444ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_Unaligned_SSSE3 on Windows: use movdqu instead of movdqa. Break YUVTORGB into 2 macros - one to fetch pixels, another to do YUV conversion. This gives less duplicated source and lends itself to future YUV formats.
BUG=none TEST=WebRtcVideoFrameTest.ConvertToARGBBufferStride Review URL: https://webrtc-codereview.appspot.com/644004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@279 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
43279ffda0
commit
4c416e8849
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 277
|
Version: 279
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 277
|
#define LIBYUV_VERSION 279
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
|
|||||||
@ -1212,7 +1212,6 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // HAS_ARGBTOYROW_SSSE3
|
#endif // HAS_ARGBTOYROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||||
@ -1251,73 +1250,32 @@ struct {
|
|||||||
{ YG, YG, YG, YG, YG, YG, YG, YG }
|
{ YG, YG, YG, YG, YG, YG, YG, YG }
|
||||||
};
|
};
|
||||||
|
|
||||||
// Convert 8 pixels: 8 UV and 8 Y
|
// Read 8 UV from 444
|
||||||
#define YUV444TORGB \
|
#define READYUV444 \
|
||||||
"movq (%1),%%xmm0 \n" \
|
"movq (%1),%%xmm0 \n" \
|
||||||
"movq (%1,%2,1),%%xmm1 \n" \
|
"movq (%1,%2,1),%%xmm1 \n" \
|
||||||
"lea 0x8(%1),%1 \n" \
|
"lea 0x8(%1),%1 \n" \
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||||
"movdqa %%xmm0,%%xmm1 \n" \
|
|
||||||
"movdqa %%xmm0,%%xmm2 \n" \
|
|
||||||
"pmaddubsw (%5),%%xmm0 \n" \
|
|
||||||
"pmaddubsw 16(%5),%%xmm1 \n" \
|
|
||||||
"pmaddubsw 32(%5),%%xmm2 \n" \
|
|
||||||
"psubw 48(%5),%%xmm0 \n" \
|
|
||||||
"psubw 64(%5),%%xmm1 \n" \
|
|
||||||
"psubw 80(%5),%%xmm2 \n" \
|
|
||||||
"movq (%0),%%xmm3 \n" \
|
|
||||||
"lea 0x8(%0),%0 \n" \
|
|
||||||
"punpcklbw %%xmm4,%%xmm3 \n" \
|
|
||||||
"psubsw 96(%5),%%xmm3 \n" \
|
|
||||||
"pmullw 112(%5),%%xmm3 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm0 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm1 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm2 \n" \
|
|
||||||
"psraw $0x6,%%xmm0 \n" \
|
|
||||||
"psraw $0x6,%%xmm1 \n" \
|
|
||||||
"psraw $0x6,%%xmm2 \n" \
|
|
||||||
"packuswb %%xmm0,%%xmm0 \n" \
|
|
||||||
"packuswb %%xmm1,%%xmm1 \n" \
|
|
||||||
"packuswb %%xmm2,%%xmm2 \n"
|
|
||||||
|
|
||||||
// Convert 8 pixels: 4 UV and 8 Y
|
// Read 4 UV from 422, upsample to 8 UV
|
||||||
#define YUV422TORGB \
|
#define READYUV422 \
|
||||||
"movd (%1),%%xmm0 \n" \
|
"movd (%1),%%xmm0 \n" \
|
||||||
"movd (%1,%2,1),%%xmm1 \n" \
|
"movd (%1,%2,1),%%xmm1 \n" \
|
||||||
"lea 0x4(%1),%1 \n" \
|
"lea 0x4(%1),%1 \n" \
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||||
"punpcklwd %%xmm0,%%xmm0 \n" \
|
"punpcklwd %%xmm0,%%xmm0 \n" \
|
||||||
"movdqa %%xmm0,%%xmm1 \n" \
|
|
||||||
"movdqa %%xmm0,%%xmm2 \n" \
|
|
||||||
"pmaddubsw (%5),%%xmm0 \n" \
|
|
||||||
"pmaddubsw 16(%5),%%xmm1 \n" \
|
|
||||||
"pmaddubsw 32(%5),%%xmm2 \n" \
|
|
||||||
"psubw 48(%5),%%xmm0 \n" \
|
|
||||||
"psubw 64(%5),%%xmm1 \n" \
|
|
||||||
"psubw 80(%5),%%xmm2 \n" \
|
|
||||||
"movq (%0),%%xmm3 \n" \
|
|
||||||
"lea 0x8(%0),%0 \n" \
|
|
||||||
"punpcklbw %%xmm4,%%xmm3 \n" \
|
|
||||||
"psubsw 96(%5),%%xmm3 \n" \
|
|
||||||
"pmullw 112(%5),%%xmm3 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm0 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm1 \n" \
|
|
||||||
"paddsw %%xmm3,%%xmm2 \n" \
|
|
||||||
"psraw $0x6,%%xmm0 \n" \
|
|
||||||
"psraw $0x6,%%xmm1 \n" \
|
|
||||||
"psraw $0x6,%%xmm2 \n" \
|
|
||||||
"packuswb %%xmm0,%%xmm0 \n" \
|
|
||||||
"packuswb %%xmm1,%%xmm1 \n" \
|
|
||||||
"packuswb %%xmm2,%%xmm2 \n"
|
|
||||||
|
|
||||||
// Convert 8 pixels: 2 UV and 8 Y
|
// Read 2 UV from 411, upsample to 8 UV
|
||||||
#define YUV411TORGB \
|
#define READYUV411 \
|
||||||
"movd (%1),%%xmm0 \n" \
|
"movd (%1),%%xmm0 \n" \
|
||||||
"movd (%1,%2,1),%%xmm1 \n" \
|
"movd (%1,%2,1),%%xmm1 \n" \
|
||||||
"lea 0x2(%1),%1 \n" \
|
"lea 0x2(%1),%1 \n" \
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||||
"punpcklwd %%xmm0,%%xmm0 \n" \
|
"punpcklwd %%xmm0,%%xmm0 \n" \
|
||||||
"punpckldq %%xmm0,%%xmm0 \n" \
|
"punpckldq %%xmm0,%%xmm0 \n" \
|
||||||
|
|
||||||
|
// Convert 8 pixels: 8 UV and 8 Y
|
||||||
|
#define YUVTORGB \
|
||||||
"movdqa %%xmm0,%%xmm1 \n" \
|
"movdqa %%xmm0,%%xmm1 \n" \
|
||||||
"movdqa %%xmm0,%%xmm2 \n" \
|
"movdqa %%xmm0,%%xmm2 \n" \
|
||||||
"pmaddubsw (%5),%%xmm0 \n" \
|
"pmaddubsw (%5),%%xmm0 \n" \
|
||||||
@ -1352,7 +1310,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV444TORGB
|
READYUV444
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1387,7 +1346,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1422,7 +1382,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV411TORGB
|
READYUV411
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1457,7 +1418,8 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV444TORGB
|
READYUV444
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1492,7 +1454,8 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1527,7 +1490,8 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV411TORGB
|
READYUV411
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
@ -1562,7 +1526,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||||
"punpcklbw %%xmm2,%%xmm5 \n"
|
"punpcklbw %%xmm2,%%xmm5 \n"
|
||||||
@ -1598,7 +1563,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm0 \n"
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||||
"movdqa %%xmm2,%%xmm1 \n"
|
"movdqa %%xmm2,%%xmm1 \n"
|
||||||
@ -1633,7 +1599,8 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||||
"punpcklbw %%xmm2,%%xmm5 \n"
|
"punpcklbw %%xmm2,%%xmm5 \n"
|
||||||
@ -1669,7 +1636,8 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
"pxor %%xmm4,%%xmm4 \n"
|
"pxor %%xmm4,%%xmm4 \n"
|
||||||
".p2align 4 \n"
|
".p2align 4 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||||
"punpcklbw %%xmm5,%%xmm0 \n"
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||||
"movdqa %%xmm2,%%xmm1 \n"
|
"movdqa %%xmm2,%%xmm1 \n"
|
||||||
@ -1741,7 +1709,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_YTOARGBROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_SSSE3
|
#ifdef HAS_MIRRORROW_SSSE3
|
||||||
// Shuffle table for reversing the bytes.
|
// Shuffle table for reversing the bytes.
|
||||||
@ -1772,7 +1740,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_SSE2
|
#ifdef HAS_MIRRORROW_SSE2
|
||||||
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||||
@ -1803,7 +1771,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_UV_SSSE3
|
#ifdef HAS_MIRRORROW_UV_SSSE3
|
||||||
// Shuffle table for reversing the bytes of UV channels.
|
// Shuffle table for reversing the bytes of UV channels.
|
||||||
@ -1838,7 +1806,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_UV_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_ADDROW_SSE2
|
#ifdef HAS_ADDROW_SSE2
|
||||||
// dst and width aligned to 16
|
// dst and width aligned to 16
|
||||||
@ -1939,7 +1907,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
|||||||
#endif
|
#endif
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_SPLITUV_SSE2
|
||||||
|
|
||||||
#ifdef HAS_COPYROW_SSE2
|
#ifdef HAS_COPYROW_SSE2
|
||||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
||||||
@ -1979,7 +1947,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
|
|||||||
: "memory", "cc"
|
: "memory", "cc"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_COPYROW_X86
|
||||||
|
|
||||||
#ifdef HAS_YUY2TOYROW_SSE2
|
#ifdef HAS_YUY2TOYROW_SSE2
|
||||||
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
||||||
|
|||||||
@ -1199,6 +1199,7 @@ __asm {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif // HAS_ARGBTOYROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||||
|
|
||||||
@ -1237,80 +1238,36 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
|||||||
|
|
||||||
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
|
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
|
||||||
|
|
||||||
// Convert 8 pixels: 8 UV and 8 Y
|
// Read 8 UV from 444
|
||||||
#define YUV444TORGB __asm { \
|
#define READYUV444 __asm { \
|
||||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
|
||||||
__asm movq xmm0, qword ptr [esi] /* U */ \
|
__asm movq xmm0, qword ptr [esi] /* U */ \
|
||||||
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
|
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
|
||||||
__asm lea esi, [esi + 8] \
|
__asm lea esi, [esi + 8] \
|
||||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||||
__asm movdqa xmm1, xmm0 \
|
|
||||||
__asm movdqa xmm2, xmm0 \
|
|
||||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
|
||||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
|
||||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
|
||||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
|
||||||
__asm psubw xmm1, kUVBiasG \
|
|
||||||
__asm psubw xmm2, kUVBiasR \
|
|
||||||
/* Step 2: Find Y contribution to 8 R,G,B values */ \
|
|
||||||
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
|
||||||
__asm lea eax, [eax + 8] \
|
|
||||||
__asm punpcklbw xmm3, xmm4 \
|
|
||||||
__asm psubsw xmm3, kYSub16 \
|
|
||||||
__asm pmullw xmm3, kYToRgb \
|
|
||||||
__asm paddsw xmm0, xmm3 /* B += Y */ \
|
|
||||||
__asm paddsw xmm1, xmm3 /* G += Y */ \
|
|
||||||
__asm paddsw xmm2, xmm3 /* R += Y */ \
|
|
||||||
__asm psraw xmm0, 6 \
|
|
||||||
__asm psraw xmm1, 6 \
|
|
||||||
__asm psraw xmm2, 6 \
|
|
||||||
__asm packuswb xmm0, xmm0 /* B */ \
|
|
||||||
__asm packuswb xmm1, xmm1 /* G */ \
|
|
||||||
__asm packuswb xmm2, xmm2 /* R */ \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert 8 pixels: 4 UV and 8 Y
|
// Read 4 UV from 422, upsample to 8 UV
|
||||||
#define YUV422TORGB __asm { \
|
#define READYUV422 __asm { \
|
||||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
|
||||||
__asm movd xmm0, [esi] /* U */ \
|
__asm movd xmm0, [esi] /* U */ \
|
||||||
__asm movd xmm1, [esi + edi] /* V */ \
|
__asm movd xmm1, [esi + edi] /* V */ \
|
||||||
__asm lea esi, [esi + 4] \
|
__asm lea esi, [esi + 4] \
|
||||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||||
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
||||||
__asm movdqa xmm1, xmm0 \
|
|
||||||
__asm movdqa xmm2, xmm0 \
|
|
||||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
|
||||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
|
||||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
|
||||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
|
||||||
__asm psubw xmm1, kUVBiasG \
|
|
||||||
__asm psubw xmm2, kUVBiasR \
|
|
||||||
/* Step 2: Find Y contribution to 8 R,G,B values */ \
|
|
||||||
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
|
||||||
__asm lea eax, [eax + 8] \
|
|
||||||
__asm punpcklbw xmm3, xmm4 \
|
|
||||||
__asm psubsw xmm3, kYSub16 \
|
|
||||||
__asm pmullw xmm3, kYToRgb \
|
|
||||||
__asm paddsw xmm0, xmm3 /* B += Y */ \
|
|
||||||
__asm paddsw xmm1, xmm3 /* G += Y */ \
|
|
||||||
__asm paddsw xmm2, xmm3 /* R += Y */ \
|
|
||||||
__asm psraw xmm0, 6 \
|
|
||||||
__asm psraw xmm1, 6 \
|
|
||||||
__asm psraw xmm2, 6 \
|
|
||||||
__asm packuswb xmm0, xmm0 /* B */ \
|
|
||||||
__asm packuswb xmm1, xmm1 /* G */ \
|
|
||||||
__asm packuswb xmm2, xmm2 /* R */ \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert 8 pixels: 2 UV and 8 Y
|
// Read 2 UV from 411, upsample to 8 UV
|
||||||
#define YUV411TORGB __asm { \
|
#define READYUV411 __asm { \
|
||||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
|
||||||
__asm movd xmm0, [esi] /* U */ \
|
__asm movd xmm0, [esi] /* U */ \
|
||||||
__asm movd xmm1, [esi + edi] /* V */ \
|
__asm movd xmm1, [esi + edi] /* V */ \
|
||||||
__asm lea esi, [esi + 2] \
|
__asm lea esi, [esi + 2] \
|
||||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||||
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
||||||
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
|
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert 8 pixels: 8 UV and 8 Y
|
||||||
|
#define YUVTORGB __asm { \
|
||||||
|
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
||||||
__asm movdqa xmm1, xmm0 \
|
__asm movdqa xmm1, xmm0 \
|
||||||
__asm movdqa xmm2, xmm0 \
|
__asm movdqa xmm2, xmm0 \
|
||||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||||
@ -1358,7 +1315,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV444TORGB
|
READYUV444
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1400,7 +1358,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1443,7 +1402,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV411TORGB
|
READYUV411
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1485,7 +1445,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV444TORGB
|
READYUV444
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1493,8 +1454,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||||
movdqa [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
movdqa [edx + 16], xmm1
|
movdqu [edx + 16], xmm1
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
sub ecx, 8
|
sub ecx, 8
|
||||||
jg convertloop
|
jg convertloop
|
||||||
@ -1527,7 +1488,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1535,8 +1497,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||||
movdqa [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
movdqa [edx + 16], xmm1
|
movdqu [edx + 16], xmm1
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
sub ecx, 8
|
sub ecx, 8
|
||||||
jg convertloop
|
jg convertloop
|
||||||
@ -1570,7 +1532,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV411TORGB
|
READYUV411
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ARGB
|
// Step 3: Weave into ARGB
|
||||||
punpcklbw xmm0, xmm1 // BG
|
punpcklbw xmm0, xmm1 // BG
|
||||||
@ -1578,8 +1541,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||||
movdqa [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
movdqa [edx + 16], xmm1
|
movdqu [edx + 16], xmm1
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
sub ecx, 8
|
sub ecx, 8
|
||||||
jg convertloop
|
jg convertloop
|
||||||
@ -1609,7 +1572,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into BGRA
|
// Step 3: Weave into BGRA
|
||||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||||
@ -1650,7 +1614,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ABGR
|
// Step 3: Weave into ABGR
|
||||||
punpcklbw xmm2, xmm1 // RG
|
punpcklbw xmm2, xmm1 // RG
|
||||||
@ -1689,7 +1654,8 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into BGRA
|
// Step 3: Weave into BGRA
|
||||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||||
@ -1730,7 +1696,8 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
|||||||
|
|
||||||
align 16
|
align 16
|
||||||
convertloop:
|
convertloop:
|
||||||
YUV422TORGB
|
READYUV422
|
||||||
|
YUVTORGB
|
||||||
|
|
||||||
// Step 3: Weave into ABGR
|
// Step 3: Weave into ABGR
|
||||||
punpcklbw xmm2, xmm1 // RG
|
punpcklbw xmm2, xmm1 // RG
|
||||||
@ -1796,7 +1763,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // HAS_YTOARGBROW_SSE2
|
#endif // HAS_YTOARGBROW_SSE2
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_SSSE3
|
#ifdef HAS_MIRRORROW_SSSE3
|
||||||
|
|
||||||
@ -1825,7 +1791,7 @@ __asm {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_SSE2
|
#ifdef HAS_MIRRORROW_SSE2
|
||||||
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
|
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
|
||||||
@ -1855,7 +1821,7 @@ __asm {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_MIRRORROW_UV_SSSE3
|
#ifdef HAS_MIRRORROW_UV_SSSE3
|
||||||
// Shuffle table for reversing the bytes of UV channels.
|
// Shuffle table for reversing the bytes of UV channels.
|
||||||
@ -1891,7 +1857,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_MIRRORROW_UV_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_ADDROW_SSE2
|
#ifdef HAS_ADDROW_SSE2
|
||||||
// dst and width aligned to 16
|
// dst and width aligned to 16
|
||||||
@ -1988,7 +1954,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_SPLITUV_SSE2
|
||||||
|
|
||||||
#ifdef HAS_COPYROW_SSE2
|
#ifdef HAS_COPYROW_SSE2
|
||||||
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time
|
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time
|
||||||
@ -2030,7 +1996,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
|
|||||||
ret
|
ret
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif // HAS_COPYROW_X86
|
||||||
|
|
||||||
#ifdef HAS_YUY2TOYROW_SSE2
|
#ifdef HAS_YUY2TOYROW_SSE2
|
||||||
__declspec(naked) __declspec(align(16))
|
__declspec(naked) __declspec(align(16))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user