mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
RGBAToI420_SSSE3 and I420ToRGBA_SSSE3 implemented.
BUG=78
TESTED=gcl lint
Review URL: https://webrtc-codereview.appspot.com/796009
git-svn-id: http://libyuv.googlecode.com/svn/trunk@359 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
13f3894033
commit
25dc05858e
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 358
|
||||
Version: 359
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 358
|
||||
#define LIBYUV_VERSION 359
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -37,9 +37,6 @@ extern "C" {
|
||||
#define HAS_ABGRTOARGBROW_SSSE3
|
||||
#define HAS_ABGRTOUVROW_SSSE3
|
||||
#define HAS_ABGRTOYROW_SSSE3
|
||||
// TODO(FBARCHARD): #define HAS_RGBATOARGBROW_SSSE3
|
||||
// TODO(FBARCHARD): #define HAS_RGBATOUVROW_SSSE3
|
||||
// TODO(FBARCHARD): #define HAS_RGBATOYROW_SSSE3
|
||||
#define HAS_ARGBTORGBAROW_SSSE3
|
||||
#define HAS_ARGB1555TOARGBROW_SSE2
|
||||
#define HAS_ARGB4444TOARGBROW_SSE2
|
||||
@ -61,7 +58,6 @@ extern "C" {
|
||||
#define HAS_I422TOARGBROW_SSSE3
|
||||
#define HAS_I422TOBGRAROW_SSSE3
|
||||
#define HAS_I422TOABGRROW_SSSE3
|
||||
// TODO(FBARCHARD): #define HAS_I422TORGBAROW_SSSE3
|
||||
#define HAS_I444TOARGBROW_SSSE3
|
||||
#define HAS_I411TOARGBROW_SSSE3
|
||||
#define HAS_I400TOARGBROW_SSE2
|
||||
@ -91,10 +87,13 @@ extern "C" {
|
||||
|
||||
// The following are Windows only:
|
||||
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
|
||||
// TODO(fbarchard): Investigate possible issue in this function and reenable.
|
||||
#define HAS_ARGBCOLORTABLEROW_X86
|
||||
#define HAS_NV12TOARGBROW_SSSE3
|
||||
#define HAS_NV21TOARGBROW_SSSE3
|
||||
#define HAS_I422TORGBAROW_SSSE3
|
||||
#define HAS_RGBATOARGBROW_SSSE3
|
||||
#define HAS_RGBATOUVROW_SSSE3
|
||||
#define HAS_RGBATOYROW_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are disabled when SSSE3 is available:
|
||||
|
||||
@ -941,7 +941,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
}
|
||||
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0)
|
||||
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1)
|
||||
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2)
|
||||
@ -949,13 +949,17 @@ Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0)
|
||||
Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0)
|
||||
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1)
|
||||
YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1)
|
||||
// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
#ifdef HAS_I422TORGBAROW_SSSE3
|
||||
YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGBROW_NEON
|
||||
YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
|
||||
YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
|
||||
YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
|
||||
// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
|
||||
#endif
|
||||
#ifdef HAS_I422TORGBAROW_NEON
|
||||
YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
|
||||
#endif
|
||||
#undef YANY
|
||||
|
||||
@ -987,7 +991,9 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2)
|
||||
YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
|
||||
YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
|
||||
YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
|
||||
// TODO(fbarchard): YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
|
||||
#ifdef HAS_RGBATOYROW_SSSE3
|
||||
YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
|
||||
#endif
|
||||
YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
|
||||
YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
|
||||
#undef YANY
|
||||
@ -1006,7 +1012,9 @@ YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
|
||||
UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
|
||||
UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
|
||||
UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
|
||||
// TODO(fbarchard): NOLINT UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
|
||||
#ifdef HAS_RGBATOYROW_SSSE3
|
||||
UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
|
||||
#endif
|
||||
UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
|
||||
UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
|
||||
#undef UVANY
|
||||
|
||||
@ -59,6 +59,19 @@ static const vec8 kABGRToV = {
|
||||
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
|
||||
};
|
||||
|
||||
// Constants for RGBA.
|
||||
static const vec8 kRGBAToY = {
|
||||
0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
|
||||
};
|
||||
|
||||
static const vec8 kRGBAToU = {
|
||||
0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
|
||||
};
|
||||
|
||||
static const vec8 kRGBAToV = {
|
||||
0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
|
||||
};
|
||||
|
||||
static const uvec8 kAddY16 = {
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
|
||||
};
|
||||
@ -78,16 +91,16 @@ static const uvec8 kShuffleMaskRAWToARGB = {
|
||||
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting ABGR to ARGB.
|
||||
static const uvec8 kShuffleMaskABGRToARGB = {
|
||||
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting BGRA to ARGB.
|
||||
static const uvec8 kShuffleMaskBGRAToARGB = {
|
||||
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
|
||||
};
|
||||
|
||||
// Shuffle table for converting ABGR to ARGB.
|
||||
static const uvec8 kShuffleMaskABGRToARGB = {
|
||||
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting RGBA to ARGB.
|
||||
static const uvec8 kShuffleMaskRGBAToARGB = {
|
||||
1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
|
||||
@ -137,12 +150,12 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
||||
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_abgr
|
||||
mov eax, [esp + 4] // src_bgra
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
movdqa xmm5, kShuffleMaskABGRToARGB
|
||||
movdqa xmm5, kShuffleMaskBGRAToARGB
|
||||
sub edx, eax
|
||||
|
||||
align 16
|
||||
@ -158,12 +171,12 @@ __asm {
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
|
||||
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_bgra
|
||||
mov eax, [esp + 4] // src_abgr
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
movdqa xmm5, kShuffleMaskBGRAToARGB
|
||||
movdqa xmm5, kShuffleMaskABGRToARGB
|
||||
sub edx, eax
|
||||
|
||||
align 16
|
||||
@ -844,6 +857,74 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
movdqa xmm5, kAddY16
|
||||
movdqa xmm4, kRGBAToY
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pmaddubsw xmm0, xmm4
|
||||
pmaddubsw xmm1, xmm4
|
||||
pmaddubsw xmm2, xmm4
|
||||
pmaddubsw xmm3, xmm4
|
||||
lea eax, [eax + 64]
|
||||
phaddw xmm0, xmm1
|
||||
phaddw xmm2, xmm3
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
movdqa xmm5, kAddY16
|
||||
movdqa xmm4, kRGBAToY
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + 16]
|
||||
movdqu xmm2, [eax + 32]
|
||||
movdqu xmm3, [eax + 48]
|
||||
pmaddubsw xmm0, xmm4
|
||||
pmaddubsw xmm1, xmm4
|
||||
pmaddubsw xmm2, xmm4
|
||||
pmaddubsw xmm3, xmm4
|
||||
lea eax, [eax + 64]
|
||||
phaddw xmm0, xmm1
|
||||
phaddw xmm2, xmm3
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm5
|
||||
sub ecx, 16
|
||||
movdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
@ -1251,6 +1332,142 @@ __asm {
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // src_argb
|
||||
mov esi, [esp + 8 + 8] // src_stride_argb
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
movdqa xmm7, kRGBAToU
|
||||
movdqa xmm6, kRGBAToV
|
||||
movdqa xmm5, kAddUV128
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
/* step 1 - subsample 16x2 rgba pixels to 8x1 */
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pavgb xmm0, [eax + esi]
|
||||
pavgb xmm1, [eax + esi + 16]
|
||||
pavgb xmm2, [eax + esi + 32]
|
||||
pavgb xmm3, [eax + esi + 48]
|
||||
lea eax, [eax + 64]
|
||||
movdqa xmm4, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm4, xmm1, 0xdd
|
||||
pavgb xmm0, xmm4
|
||||
movdqa xmm4, xmm2
|
||||
shufps xmm2, xmm3, 0x88
|
||||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
pmaddubsw xmm3, xmm6
|
||||
phaddw xmm0, xmm2
|
||||
phaddw xmm1, xmm3
|
||||
psraw xmm0, 8
|
||||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // src_argb
|
||||
mov esi, [esp + 8 + 8] // src_stride_argb
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
movdqa xmm7, kRGBAToU
|
||||
movdqa xmm6, kRGBAToV
|
||||
movdqa xmm5, kAddUV128
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
/* step 1 - subsample 16x2 rgba pixels to 8x1 */
|
||||
movdqu xmm0, [eax]
|
||||
movdqu xmm1, [eax + 16]
|
||||
movdqu xmm2, [eax + 32]
|
||||
movdqu xmm3, [eax + 48]
|
||||
movdqu xmm4, [eax + esi]
|
||||
pavgb xmm0, xmm4
|
||||
movdqu xmm4, [eax + esi + 16]
|
||||
pavgb xmm1, xmm4
|
||||
movdqu xmm4, [eax + esi + 32]
|
||||
pavgb xmm2, xmm4
|
||||
movdqu xmm4, [eax + esi + 48]
|
||||
pavgb xmm3, xmm4
|
||||
lea eax, [eax + 64]
|
||||
movdqa xmm4, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm4, xmm1, 0xdd
|
||||
pavgb xmm0, xmm4
|
||||
movdqa xmm4, xmm2
|
||||
shufps xmm2, xmm3, 0x88
|
||||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, it's 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
pmaddubsw xmm3, xmm6
|
||||
phaddw xmm0, xmm2
|
||||
phaddw xmm1, xmm3
|
||||
psraw xmm0, 8
|
||||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
sub ecx, 16
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
@ -1847,47 +2064,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* abgr_buf,
|
||||
int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // Y
|
||||
mov esi, [esp + 8 + 8] // U
|
||||
mov edi, [esp + 8 + 12] // V
|
||||
mov edx, [esp + 8 + 16] // abgr
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edi, esi
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
pxor xmm4, xmm4
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ABGR
|
||||
punpcklbw xmm2, xmm1 // RG
|
||||
punpcklbw xmm0, xmm5 // BA
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
||||
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
||||
movdqa [edx], xmm2
|
||||
movdqa [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
@ -1929,6 +2105,47 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToABGRRow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* abgr_buf,
|
||||
int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // Y
|
||||
mov esi, [esp + 8 + 8] // U
|
||||
mov edi, [esp + 8 + 12] // V
|
||||
mov edx, [esp + 8 + 16] // abgr
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edi, esi
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
pxor xmm4, xmm4
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into ABGR
|
||||
punpcklbw xmm2, xmm1 // RG
|
||||
punpcklbw xmm0, xmm5 // BA
|
||||
movdqa xmm1, xmm2
|
||||
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
||||
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
||||
movdqa [edx], xmm2
|
||||
movdqa [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
@ -1969,6 +2186,89 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToRGBARow_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgba_buf,
|
||||
int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // Y
|
||||
mov esi, [esp + 8 + 8] // U
|
||||
mov edi, [esp + 8 + 12] // V
|
||||
mov edx, [esp + 8 + 16] // rgba
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edi, esi
|
||||
pxor xmm4, xmm4
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into RGBA
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
punpcklbw xmm1, xmm2 // GR
|
||||
punpcklbw xmm5, xmm0 // AB
|
||||
movdqa xmm0, xmm5
|
||||
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
|
||||
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
|
||||
movdqa [edx], xmm5
|
||||
movdqa [edx + 16], xmm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* rgba_buf,
|
||||
int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // Y
|
||||
mov esi, [esp + 8 + 8] // U
|
||||
mov edi, [esp + 8 + 12] // V
|
||||
mov edx, [esp + 8 + 16] // rgba
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edi, esi
|
||||
pxor xmm4, xmm4
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
READYUV422
|
||||
YUVTORGB
|
||||
|
||||
// Step 3: Weave into RGBA
|
||||
pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
|
||||
punpcklbw xmm1, xmm2 // GR
|
||||
punpcklbw xmm5, xmm0 // AB
|
||||
movdqa xmm0, xmm5
|
||||
punpcklwd xmm5, xmm1 // RGBA first 4 pixels
|
||||
punpckhwd xmm0, xmm1 // RGBA next 4 pixels
|
||||
movdqu [edx], xmm5
|
||||
movdqu [edx + 16], xmm0
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
jg convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
#endif // HAS_I422TOARGBROW_SSSE3
|
||||
|
||||
#ifdef HAS_YTOARGBROW_SSE2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user