libyuv/source/row_win.cc
fbarchard@google.com 585a126140 rewrite ARGBToI420 with SSSE3
TEST=talk unittests
BUG=none
Review URL: http://webrtc-codereview.appspot.com/251003

git-svn-id: http://libyuv.googlecode.com/svn/trunk@46 16f28f9a-4ce2-e073-06de-1de4eb20be90
2011-10-28 23:51:08 +00:00

376 lines
11 KiB
C++

/*
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "row.h"
extern "C" {
#ifdef HAS_ARGBTOYROW_SSSE3
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
// Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kRGBToY[16]) = {
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
extern "C" TALIGN16(const int8, kRGBToU[16]) = {
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};
extern "C" TALIGN16(const int8, kRGBToV[16]) = {
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
};
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
movdqa xmm7, _kRGBToY
movdqa xmm6, _kAddY16
pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff
psrld xmm5, 16
convertloop :
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
pmaddubsw xmm0, xmm7
lea eax, [eax + 32]
pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
palignr xmm3, xmm1, 2
paddw xmm3, xmm1
pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx
packuswb xmm2, xmm2
paddb xmm2, xmm6
movq qword ptr [edx], xmm2
lea edx, [edx + 8]
sub ecx, 8
ja convertloop
ret
}
}
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_argb
mov esi, [esp + 8 + 8] // src_stride_argb
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
movdqa xmm7, _kRGBToU
movdqa xmm6, _kRGBToV
movdqa xmm5, _kAddUV128
pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff
psrld xmm4, 16
convertloop :
// step 1 - subsample 8x2 argb pixels to 4x1
movdqa xmm0, [eax] // 32x2 -> 32x1
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + esi]
movdqa xmm3, [eax + esi + 16]
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
movdqa xmm2, xmm0 // 32x1 -> 16x1
shufps xmm0, xmm1, 0x88
shufps xmm2, xmm1, 0xdd
pavgb xmm0, xmm2
// step 2 - convert to U and V
// from here down is very similar to Y code except
// instead of 8 different pixels, its 4 pixels of U and 4 of V
movdqa xmm1, xmm0
pmaddubsw xmm0, xmm7 // U
pmaddubsw xmm1, xmm6 // V
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
palignr xmm3, xmm1, 2
paddw xmm3, xmm1
pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
psraw xmm2, 8
psraw xmm3, 8
packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
paddb xmm2, xmm5 // -> unsigned
packuswb xmm2, xmm2 // 8 bytes. 4 U, 4 V
// step 3 - store 4 U and 4 V values
movd dword ptr [edx], xmm2 // U
lea edx, [edx + 4]
pshufd xmm0, xmm2, 0x55 // V
movd dword ptr [edi], xmm0
lea edi, [edi + 4]
sub ecx, 8
ja convertloop
pop edi
pop esi
ret
}
}
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
}
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
}
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
}
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
for (int x = 0; x < width; ++x) {
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
src_argb0 += 4;
dst_y += 1;
}
}
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width) {
const uint8* src_argb1 = src_argb0 + src_stride_argb;
for (int x = 0; x < width - 1; x += 2) {
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
src_argb0 += 8;
src_argb1 += 8;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
}
}
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
pushad
mov edx, [esp + 32 + 4]
mov edi, [esp + 32 + 8]
mov esi, [esp + 32 + 12]
mov ebp, [esp + 32 + 16]
mov ecx, [esp + 32 + 20]
convertloop :
movzx eax, byte ptr [edi]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi]
lea esi, [esi + 1]
movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
movzx eax, byte ptr [edx]
paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
movzx ebx, byte ptr [edx + 1]
movq mm1, [_kCoefficientsRgbY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsRgbY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret
}
}
__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
pushad
mov edx, [esp + 32 + 4]
mov edi, [esp + 32 + 8]
mov esi, [esp + 32 + 12]
mov ebp, [esp + 32 + 16]
mov ecx, [esp + 32 + 20]
convertloop :
movzx eax, byte ptr [edi]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi]
lea esi, [esi + 1]
movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
movzx eax, byte ptr [edx]
paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
movzx ebx, byte ptr [edx + 1]
movq mm1, [_kCoefficientsBgraY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsBgraY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret
}
}
__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
pushad
mov edx, [esp + 32 + 4]
mov edi, [esp + 32 + 8]
mov esi, [esp + 32 + 12]
mov ebp, [esp + 32 + 16]
mov ecx, [esp + 32 + 20]
convertloop :
movzx eax, byte ptr [edi]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi]
lea esi, [esi + 1]
movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
movzx eax, byte ptr [edx]
paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
movzx ebx, byte ptr [edx + 1]
movq mm1, [_kCoefficientsAbgrY + 8 * eax]
lea edx, [edx + 2]
movq mm2, [_kCoefficientsAbgrY + 8 * ebx]
paddsw mm1, mm0
paddsw mm2, mm0
psraw mm1, 6
psraw mm2, 6
packuswb mm1, mm2
movntq [ebp], mm1
lea ebp, [ebp + 8]
sub ecx, 2
ja convertloop
popad
ret
}
}
__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm {
pushad
mov edx, [esp + 32 + 4] // Y
mov edi, [esp + 32 + 8] // U
mov esi, [esp + 32 + 12] // V
mov ebp, [esp + 32 + 16] // rgb
mov ecx, [esp + 32 + 20] // width
convertloop :
movzx eax, byte ptr [edi]
lea edi, [edi + 1]
movzx ebx, byte ptr [esi]
lea esi, [esi + 1]
movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
movzx eax, byte ptr [edx]
paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
lea edx, [edx + 1]
paddsw mm0, [_kCoefficientsRgbY + 8 * eax]
psraw mm0, 6
packuswb mm0, mm0
movd [ebp], mm0
lea ebp, [ebp + 4]
sub ecx, 1
ja convertloop
popad
ret
}
}
__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width) {
__asm {
push ebx
mov eax, [esp + 4 + 4] // Y
mov edx, [esp + 4 + 8] // rgb
mov ecx, [esp + 4 + 12] // width
convertloop :
movzx ebx, byte ptr [eax]
movq mm0, [_kCoefficientsRgbY + 8 * ebx]
psraw mm0, 6
movzx ebx, byte ptr [eax + 1]
movq mm1, [_kCoefficientsRgbY + 8 * ebx]
psraw mm1, 6
packuswb mm0, mm1
lea eax, [eax + 2]
movq [edx], mm0
lea edx, [edx + 8]
sub ecx, 2
ja convertloop
pop ebx
ret
}
}
#endif
} // extern "C"