mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Fix for I444ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_Unaligned_SSSE3 on Windows: use movdqu instead of movdqa. Break YUVTORGB into 2 macros - one to fetch pixels, another to do YUV conversion. This means less duplicated source and lends itself to future YUV formats.
BUG=none TEST=WebRtcVideoFrameTest.ConvertToARGBBufferStride Review URL: https://webrtc-codereview.appspot.com/644004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@279 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
43279ffda0
commit
4c416e8849
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 277
|
||||
Version: 279
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 277
|
||||
#define LIBYUV_VERSION 279
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
|
||||
@ -1212,7 +1212,6 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
@ -1251,73 +1250,32 @@ struct {
|
||||
{ YG, YG, YG, YG, YG, YG, YG, YG }
|
||||
};
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y
|
||||
#define YUV444TORGB \
|
||||
// Read 8 UV from 444
|
||||
#define READYUV444 \
|
||||
"movq (%1),%%xmm0 \n" \
|
||||
"movq (%1,%2,1),%%xmm1 \n" \
|
||||
"lea 0x8(%1),%1 \n" \
|
||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||
"movdqa %%xmm0,%%xmm1 \n" \
|
||||
"movdqa %%xmm0,%%xmm2 \n" \
|
||||
"pmaddubsw (%5),%%xmm0 \n" \
|
||||
"pmaddubsw 16(%5),%%xmm1 \n" \
|
||||
"pmaddubsw 32(%5),%%xmm2 \n" \
|
||||
"psubw 48(%5),%%xmm0 \n" \
|
||||
"psubw 64(%5),%%xmm1 \n" \
|
||||
"psubw 80(%5),%%xmm2 \n" \
|
||||
"movq (%0),%%xmm3 \n" \
|
||||
"lea 0x8(%0),%0 \n" \
|
||||
"punpcklbw %%xmm4,%%xmm3 \n" \
|
||||
"psubsw 96(%5),%%xmm3 \n" \
|
||||
"pmullw 112(%5),%%xmm3 \n" \
|
||||
"paddsw %%xmm3,%%xmm0 \n" \
|
||||
"paddsw %%xmm3,%%xmm1 \n" \
|
||||
"paddsw %%xmm3,%%xmm2 \n" \
|
||||
"psraw $0x6,%%xmm0 \n" \
|
||||
"psraw $0x6,%%xmm1 \n" \
|
||||
"psraw $0x6,%%xmm2 \n" \
|
||||
"packuswb %%xmm0,%%xmm0 \n" \
|
||||
"packuswb %%xmm1,%%xmm1 \n" \
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
|
||||
// Convert 8 pixels: 4 UV and 8 Y
|
||||
#define YUV422TORGB \
|
||||
// Read 4 UV from 422, upsample to 8 UV
|
||||
#define READYUV422 \
|
||||
"movd (%1),%%xmm0 \n" \
|
||||
"movd (%1,%2,1),%%xmm1 \n" \
|
||||
"lea 0x4(%1),%1 \n" \
|
||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||
"punpcklwd %%xmm0,%%xmm0 \n" \
|
||||
"movdqa %%xmm0,%%xmm1 \n" \
|
||||
"movdqa %%xmm0,%%xmm2 \n" \
|
||||
"pmaddubsw (%5),%%xmm0 \n" \
|
||||
"pmaddubsw 16(%5),%%xmm1 \n" \
|
||||
"pmaddubsw 32(%5),%%xmm2 \n" \
|
||||
"psubw 48(%5),%%xmm0 \n" \
|
||||
"psubw 64(%5),%%xmm1 \n" \
|
||||
"psubw 80(%5),%%xmm2 \n" \
|
||||
"movq (%0),%%xmm3 \n" \
|
||||
"lea 0x8(%0),%0 \n" \
|
||||
"punpcklbw %%xmm4,%%xmm3 \n" \
|
||||
"psubsw 96(%5),%%xmm3 \n" \
|
||||
"pmullw 112(%5),%%xmm3 \n" \
|
||||
"paddsw %%xmm3,%%xmm0 \n" \
|
||||
"paddsw %%xmm3,%%xmm1 \n" \
|
||||
"paddsw %%xmm3,%%xmm2 \n" \
|
||||
"psraw $0x6,%%xmm0 \n" \
|
||||
"psraw $0x6,%%xmm1 \n" \
|
||||
"psraw $0x6,%%xmm2 \n" \
|
||||
"packuswb %%xmm0,%%xmm0 \n" \
|
||||
"packuswb %%xmm1,%%xmm1 \n" \
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
|
||||
// Convert 8 pixels: 2 UV and 8 Y
|
||||
#define YUV411TORGB \
|
||||
// Read 2 UV from 411, upsample to 8 UV
|
||||
#define READYUV411 \
|
||||
"movd (%1),%%xmm0 \n" \
|
||||
"movd (%1,%2,1),%%xmm1 \n" \
|
||||
"lea 0x2(%1),%1 \n" \
|
||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||
"punpcklwd %%xmm0,%%xmm0 \n" \
|
||||
"punpckldq %%xmm0,%%xmm0 \n" \
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y
|
||||
#define YUVTORGB \
|
||||
"movdqa %%xmm0,%%xmm1 \n" \
|
||||
"movdqa %%xmm0,%%xmm2 \n" \
|
||||
"pmaddubsw (%5),%%xmm0 \n" \
|
||||
@ -1352,7 +1310,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV444TORGB
|
||||
READYUV444
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1387,7 +1346,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1422,7 +1382,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV411TORGB
|
||||
READYUV411
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1457,7 +1418,8 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV444TORGB
|
||||
READYUV444
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1492,7 +1454,8 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1527,7 +1490,8 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV411TORGB
|
||||
READYUV411
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
@ -1562,7 +1526,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm2,%%xmm5 \n"
|
||||
@ -1598,7 +1563,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm1 \n"
|
||||
@ -1633,7 +1599,8 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm2,%%xmm5 \n"
|
||||
@ -1669,7 +1636,8 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
"pxor %%xmm4,%%xmm4 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm1 \n"
|
||||
@ -1741,7 +1709,7 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_YTOARGBROW_SSE2
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
// Shuffle table for reversing the bytes.
|
||||
@ -1772,7 +1740,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_SSSE3
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSE2
|
||||
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||
@ -1803,7 +1771,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_SSE2
|
||||
|
||||
#ifdef HAS_MIRRORROW_UV_SSSE3
|
||||
// Shuffle table for reversing the bytes of UV channels.
|
||||
@ -1838,7 +1806,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_UV_SSSE3
|
||||
|
||||
#ifdef HAS_ADDROW_SSE2
|
||||
// dst and width aligned to 16
|
||||
@ -1939,7 +1907,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_SPLITUV_SSE2
|
||||
|
||||
#ifdef HAS_COPYROW_SSE2
|
||||
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
||||
@ -1979,7 +1947,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
|
||||
: "memory", "cc"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_COPYROW_X86
|
||||
|
||||
#ifdef HAS_YUY2TOYROW_SSE2
|
||||
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
||||
|
||||
@ -1199,6 +1199,7 @@ __asm {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
|
||||
@ -1237,80 +1238,36 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
|
||||
|
||||
// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y
|
||||
#define YUV444TORGB __asm { \
|
||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
||||
// Read 8 UV from 444
|
||||
#define READYUV444 __asm { \
|
||||
__asm movq xmm0, qword ptr [esi] /* U */ \
|
||||
__asm movq xmm1, qword ptr [esi + edi] /* V */ \
|
||||
__asm lea esi, [esi + 8] \
|
||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||
__asm movdqa xmm1, xmm0 \
|
||||
__asm movdqa xmm2, xmm0 \
|
||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
||||
__asm psubw xmm1, kUVBiasG \
|
||||
__asm psubw xmm2, kUVBiasR \
|
||||
/* Step 2: Find Y contribution to 8 R,G,B values */ \
|
||||
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
||||
__asm lea eax, [eax + 8] \
|
||||
__asm punpcklbw xmm3, xmm4 \
|
||||
__asm psubsw xmm3, kYSub16 \
|
||||
__asm pmullw xmm3, kYToRgb \
|
||||
__asm paddsw xmm0, xmm3 /* B += Y */ \
|
||||
__asm paddsw xmm1, xmm3 /* G += Y */ \
|
||||
__asm paddsw xmm2, xmm3 /* R += Y */ \
|
||||
__asm psraw xmm0, 6 \
|
||||
__asm psraw xmm1, 6 \
|
||||
__asm psraw xmm2, 6 \
|
||||
__asm packuswb xmm0, xmm0 /* B */ \
|
||||
__asm packuswb xmm1, xmm1 /* G */ \
|
||||
__asm packuswb xmm2, xmm2 /* R */ \
|
||||
}
|
||||
|
||||
// Convert 8 pixels: 4 UV and 8 Y
|
||||
#define YUV422TORGB __asm { \
|
||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
||||
// Read 4 UV from 422, upsample to 8 UV
|
||||
#define READYUV422 __asm { \
|
||||
__asm movd xmm0, [esi] /* U */ \
|
||||
__asm movd xmm1, [esi + edi] /* V */ \
|
||||
__asm lea esi, [esi + 4] \
|
||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
||||
__asm movdqa xmm1, xmm0 \
|
||||
__asm movdqa xmm2, xmm0 \
|
||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
||||
__asm psubw xmm1, kUVBiasG \
|
||||
__asm psubw xmm2, kUVBiasR \
|
||||
/* Step 2: Find Y contribution to 8 R,G,B values */ \
|
||||
__asm movq xmm3, qword ptr [eax] /* NOLINT */ \
|
||||
__asm lea eax, [eax + 8] \
|
||||
__asm punpcklbw xmm3, xmm4 \
|
||||
__asm psubsw xmm3, kYSub16 \
|
||||
__asm pmullw xmm3, kYToRgb \
|
||||
__asm paddsw xmm0, xmm3 /* B += Y */ \
|
||||
__asm paddsw xmm1, xmm3 /* G += Y */ \
|
||||
__asm paddsw xmm2, xmm3 /* R += Y */ \
|
||||
__asm psraw xmm0, 6 \
|
||||
__asm psraw xmm1, 6 \
|
||||
__asm psraw xmm2, 6 \
|
||||
__asm packuswb xmm0, xmm0 /* B */ \
|
||||
__asm packuswb xmm1, xmm1 /* G */ \
|
||||
__asm packuswb xmm2, xmm2 /* R */ \
|
||||
}
|
||||
|
||||
// Convert 8 pixels: 2 UV and 8 Y
|
||||
#define YUV411TORGB __asm { \
|
||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
||||
// Read 2 UV from 411, upsample to 8 UV
|
||||
#define READYUV411 __asm { \
|
||||
__asm movd xmm0, [esi] /* U */ \
|
||||
__asm movd xmm1, [esi + edi] /* V */ \
|
||||
__asm lea esi, [esi + 2] \
|
||||
__asm punpcklbw xmm0, xmm1 /* UV */ \
|
||||
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
||||
__asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
|
||||
}
|
||||
|
||||
// Convert 8 pixels: 8 UV and 8 Y
|
||||
#define YUVTORGB __asm { \
|
||||
/* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
|
||||
__asm movdqa xmm1, xmm0 \
|
||||
__asm movdqa xmm2, xmm0 \
|
||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||
@ -1358,7 +1315,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV444TORGB
|
||||
READYUV444
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1400,7 +1358,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1443,7 +1402,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV411TORGB
|
||||
READYUV411
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1485,7 +1445,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV444TORGB
|
||||
READYUV444
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1493,8 +1454,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||
movdqa [edx], xmm0
|
||||
movdqa [edx + 16], xmm1
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
@ -1527,7 +1488,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1535,8 +1497,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||
movdqa [edx], xmm0
|
||||
movdqa [edx + 16], xmm1
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
@ -1570,7 +1532,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV411TORGB
|
||||
READYUV411
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ARGB
|
||||
punpcklbw xmm0, xmm1 // BG
|
||||
@ -1578,8 +1541,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
movdqa xmm1, xmm0
|
||||
punpcklwd xmm0, xmm2 // BGRA first 4 pixels
|
||||
punpckhwd xmm1, xmm2 // BGRA next 4 pixels
|
||||
movdqa [edx], xmm0
|
||||
movdqa [edx + 16], xmm1
|
||||
movdqu [edx], xmm0
|
||||
movdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
@ -1609,7 +1572,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into BGRA
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
@ -1650,7 +1614,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ABGR
|
||||
punpcklbw xmm2, xmm1 // RG
|
||||
@ -1689,7 +1654,8 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into BGRA
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
@ -1730,7 +1696,8 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
YUV422TORGB
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ABGR
|
||||
punpcklbw xmm2, xmm1 // RG
|
||||
@ -1796,7 +1763,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
#endif // HAS_YTOARGBROW_SSE2
|
||||
#endif
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
|
||||
@ -1825,7 +1791,7 @@ __asm {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_SSSE3
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSE2
|
||||
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
|
||||
@ -1855,7 +1821,7 @@ __asm {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_SSE2
|
||||
|
||||
#ifdef HAS_MIRRORROW_UV_SSSE3
|
||||
// Shuffle table for reversing the bytes of UV channels.
|
||||
@ -1891,7 +1857,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_MIRRORROW_UV_SSSE3
|
||||
|
||||
#ifdef HAS_ADDROW_SSE2
|
||||
// dst and width aligned to 16
|
||||
@ -1988,7 +1954,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_SPLITUV_SSE2
|
||||
|
||||
#ifdef HAS_COPYROW_SSE2
|
||||
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time
|
||||
@ -2030,7 +1996,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_COPYROW_X86
|
||||
|
||||
#ifdef HAS_YUY2TOYROW_SSE2
|
||||
__declspec(naked) __declspec(align(16))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user