mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
TEST=talk unittests BUG=none Review URL: http://webrtc-codereview.appspot.com/251003 git-svn-id: http://libyuv.googlecode.com/svn/trunk@46 16f28f9a-4ce2-e073-06de-1de4eb20be90
376 lines
11 KiB
C++
376 lines
11 KiB
C++
/*
|
|
* Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "row.h"
|
|
|
|
extern "C" {
|
|
|
|
#ifdef HAS_ARGBTOYROW_SSSE3
|
|
// Declares a file-local (static) variable of type t aligned to 16 bytes.
// The variable name is prefixed with an underscore ("_" ## var), which is the
// spelling the __asm blocks in this file use to load the tables with movdqa.
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
|
|
|
|
// Constant multiplication table for converting ARGB to I400.
|
|
extern "C" TALIGN16(const int8, kRGBToY[16]) = {
|
|
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
|
|
};
|
|
|
|
// Multiplier table for converting ARGB to U with pmaddubsw.
// One row per pixel, bytes in memory order B, G, R, A. The weights are the
// same as in RGBToU(): 112 * b - 74 * g - 38 * r, alpha ignored.
extern "C" TALIGN16(const int8, kRGBToU[16]) = {
  112, -74, -38, 0,
  112, -74, -38, 0,
  112, -74, -38, 0,
  112, -74, -38, 0
};
|
|
|
|
// Multiplier table for converting ARGB to V with pmaddubsw.
// One row per pixel, bytes in memory order B, G, R, A. The weights are the
// same as in RGBToV(): -18 * b - 94 * g + 112 * r, alpha ignored.
extern "C" TALIGN16(const int8, kRGBToV[16]) = {
  -18, -94, 112, 0,
  -18, -94, 112, 0,
  -18, -94, 112, 0,
  -18, -94, 112, 0
};
|
|
|
|
// Bias of 16 for every byte lane; ARGBToYRow_SSSE3 adds it with paddb to the
// packed Y bytes, matching the "+ 16" in RGBToY().
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
  16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u
};
|
|
|
|
// Bias of 128 on every even byte lane and 0 on every odd one.
// ARGBToUVRow_SSSE3 adds it with paddb while each U/V value sits in the low
// byte of a 16 bit word, matching the "+ 128" in RGBToU() / RGBToV().
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
  128u, 0u, 128u, 0u,
  128u, 0u, 128u, 0u,
  128u, 0u, 128u, 0u,
  128u, 0u, 128u, 0u
};
|
|
|
|
// Converts a row of ARGB pixels to 8 bit Y using SSSE3.
//
//   src_argb  source pixels, 4 bytes each; read with movdqa, so the pointer
//             must be 16 byte aligned.
//   dst_y     destination, one byte per pixel.
//   pix       number of pixels.
//
// Every pass of the loop consumes 8 pixels (32 bytes) and stores 8 Y bytes.
// The loop body runs before the count is tested, so it always executes at
// least once and a partial last group is handled as a full group of 8.
// Naked function: arguments are read directly from the stack and only
// eax, ecx, edx and xmm0-xmm7 are modified.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]        // eax = src_argb
    mov        edx, [esp + 8]        // edx = dst_y
    mov        ecx, [esp + 12]       // ecx = pix
    movdqa     xmm7, _kRGBToY        // per-byte B, G, R, A weights
    movdqa     xmm6, _kAddY16        // +16 bias for every output byte
    pcmpeqb    xmm5, xmm5            // all ones ...
    psrld      xmm5, 16              // ... -> 0x0000ffff in every dword

  y_loop:
    movdqa     xmm0, [eax]           // pixels 0-3
    movdqa     xmm1, [eax + 16]      // pixels 4-7
    pmaddubsw  xmm0, xmm7            // per pixel: word0 = B+G term, word1 = R+A term
    lea        eax, [eax + 32]       // advance source by 8 pixels
    pmaddubsw  xmm1, xmm7

    // Add the two partial sums of each pixel. palignr moves every word of
    // the source down by one position; the word that enters at the top comes
    // from the old destination contents and is discarded by the mask below.
    palignr    xmm2, xmm0, 2
    paddw      xmm2, xmm0            // even words now hold the full sum
    pand       xmm2, xmm5            // keep the even words: one dword per pixel
    palignr    xmm3, xmm1, 2
    paddw      xmm3, xmm1
    pand       xmm3, xmm5

    packssdw   xmm2, xmm3            // 8 sums as 16 bit words
    psrlw      xmm2, 7               // scale the weighted sums down by 128
    packuswb   xmm2, xmm2            // 8 bytes of Y (duplicated in the high half)
    paddb      xmm2, xmm6            // add the 16 bias
    movq       qword ptr [edx], xmm2 // store 8 Y values
    lea        edx, [edx + 8]
    sub        ecx, 8
    ja         y_loop                // repeat while pixels remain
    ret
  }
}
|
|
|
|
// Converts two adjacent rows of ARGB pixels to one row of U and one row of V
// using SSSE3, subsampling 2x2.
//
//   src_argb0        first source row; the second row is read at
//                    src_argb0 + src_stride_argb. Both rows are read with
//                    movdqa, so both must be 16 byte aligned.
//   src_stride_argb  byte offset from the first row to the second.
//   dst_u, dst_v     destinations, one byte per 2x2 block of pixels.
//   width            number of source pixels per row.
//
// Every pass of the loop consumes 8 pixels from each row and stores 4 U bytes
// and 4 V bytes. The loop body runs before the count is tested, so it always
// executes at least once. Naked function: esi and edi are saved and restored
// here; eax, ecx, edx and xmm0-xmm7 are modified.
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // eax = src_argb0
    mov        esi, [esp + 8 + 8]    // esi = src_stride_argb
    mov        edx, [esp + 8 + 12]   // edx = dst_u
    mov        edi, [esp + 8 + 16]   // edi = dst_v
    mov        ecx, [esp + 8 + 20]   // ecx = width
    movdqa     xmm7, _kRGBToU        // per-byte B, G, R, A weights for U
    movdqa     xmm6, _kRGBToV        // per-byte B, G, R, A weights for V
    movdqa     xmm5, _kAddUV128      // +128 bias on even bytes
    pcmpeqb    xmm4, xmm4            // all ones ...
    psrld      xmm4, 16              // ... -> 0x0000ffff in every dword

  uv_loop:
    // Step 1 - reduce 8x2 ARGB pixels to 4x1.
    movdqa     xmm0, [eax]           // row 0, pixels 0-3
    movdqa     xmm1, [eax + 16]      // row 0, pixels 4-7
    movdqa     xmm2, [eax + esi]     // row 1, pixels 0-3
    movdqa     xmm3, [eax + esi + 16]  // row 1, pixels 4-7
    lea        eax, [eax + 32]       // advance source by 8 pixels
    pavgb      xmm0, xmm2            // vertical average of the two rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even numbered pixels 0, 2, 4, 6
    shufps     xmm2, xmm1, 0xdd      // odd numbered pixels 1, 3, 5, 7
    pavgb      xmm0, xmm2            // horizontal average: 4 pixels remain

    // Step 2 - convert the 4 averaged pixels to U and V. Same scheme as the
    // Y conversion, but with 4 U and 4 V results instead of 8 Y results.
    movdqa     xmm1, xmm0
    pmaddubsw  xmm0, xmm7            // U: per pixel word0 = B+G term, word1 = R+A term
    pmaddubsw  xmm1, xmm6            // V: same layout

    // palignr moves every word of the source down by one position so the
    // two partial sums of a pixel line up; the word entering at the top
    // comes from the old destination contents and is masked off.
    palignr    xmm2, xmm0, 2
    paddw      xmm2, xmm0            // even words hold the full U sum
    pand       xmm2, xmm4            // one dword per pixel, high word zero

    palignr    xmm3, xmm1, 2
    paddw      xmm3, xmm1            // even words hold the full V sum
    pand       xmm3, xmm4

    psraw      xmm2, 8               // signed scale down by 256
    psraw      xmm3, 8
    packsswb   xmm2, xmm3            // bytes: U0 0 U1 0 U2 0 U3 0 V0 0 V1 0 V2 0 V3 0
    paddb      xmm2, xmm5            // +128 on the value bytes -> unsigned
    packuswb   xmm2, xmm2            // 8 bytes: 4 U followed by 4 V

    // Step 3 - store 4 U and 4 V values.
    movd       dword ptr [edx], xmm2 // U
    lea        edx, [edx + 4]
    pshufd     xmm0, xmm2, 0x55      // broadcast dword 1, which holds V
    movd       dword ptr [edi], xmm0 // V
    lea        edi, [edi + 4]
    sub        ecx, 8
    ja         uv_loop               // repeat while pixels remain
    pop        edi
    pop        esi
    ret
  }
}
|
|
|
|
// Returns Y for one RGB pixel: an 8.8 fixed point weighted sum of the
// channels, rounded (+128 before the shift) and offset by 16.
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
  const int weighted = 66 * r + 129 * g + 25 * b;
  return ((weighted + 128) >> 8) + 16;
}
|
|
|
|
// Returns U for one RGB pixel: an 8.8 fixed point weighted sum of the
// channels, rounded (+128 before the shift) and offset by 128.
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
  const int weighted = -38 * r - 74 * g + 112 * b;
  return ((weighted + 128) >> 8) + 128;
}
|
|
// Returns V for one RGB pixel: an 8.8 fixed point weighted sum of the
// channels, rounded (+128 before the shift) and offset by 128.
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
  const int weighted = 112 * r - 94 * g - 18 * b;
  return ((weighted + 128) >> 8) + 128;
}
|
|
|
|
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
|
|
for (int x = 0; x < width; ++x) {
|
|
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
|
|
src_argb0 += 4;
|
|
dst_y += 1;
|
|
}
|
|
}
|
|
|
|
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
const uint8* src_argb1 = src_argb0 + src_stride_argb;
|
|
for (int x = 0; x < width - 1; x += 2) {
|
|
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
|
|
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
|
|
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
|
|
dst_u[0] = RGBToU(ar, ag, ab);
|
|
dst_v[0] = RGBToV(ar, ag, ab);
|
|
src_argb0 += 8;
|
|
src_argb1 += 8;
|
|
dst_u += 1;
|
|
dst_v += 1;
|
|
}
|
|
if (width & 1) {
|
|
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
|
|
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
|
|
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
|
|
dst_u[0] = RGBToU(ar, ag, ab);
|
|
dst_v[0] = RGBToV(ar, ag, ab);
|
|
}
|
|
}
|
|
|
|
// Converts a row of Y, U and V samples to 4-byte pixels with MMX, using the
// _kCoefficientsRgbY lookup table.
//
// Table layout as used here: entries are 8 bytes (four 16 bit words); the Y
// entries start at offset 0, the U entries at offset 2048 and the V entries
// at offset 4096, each indexed by the sample value.
//
// Every pass of the loop reads one U, one V and two Y samples and writes two
// pixels (8 bytes) with a non-temporal store (movntq). The loop body runs
// before the count is tested, so it always executes at least once, and an odd
// width is processed as a full pair.
//
// Naked function: all general registers are saved with pushad / popad.
// No emms is executed here, so the MMX state is left active on return.
// NOTE(review): presumably the caller issues emms - confirm.
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // edx = y_buf
    mov        edi, [esp + 32 + 8]   // edi = u_buf
    mov        esi, [esp + 32 + 12]  // esi = v_buf
    mov        ebp, [esp + 32 + 16]  // ebp = rgb_buf
    mov        ecx, [esp + 32 + 20]  // ecx = width

  pair_loop:
    movzx      eax, byte ptr [edi]   // U sample
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]   // V sample
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // U entry
    movzx      eax, byte ptr [edx]   // first Y sample
    paddsw     mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V entry
    movzx      ebx, byte ptr [edx + 1]  // second Y sample
    movq       mm1, [_kCoefficientsRgbY + 8 * eax]  // Y entry, pixel 0
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsRgbY + 8 * ebx]  // Y entry, pixel 1
    paddsw     mm1, mm0              // both pixels share the U + V sum
    paddsw     mm2, mm0
    psraw      mm1, 6                // arithmetic shift each word right by 6
    psraw      mm2, 6
    packuswb   mm1, mm2              // saturate to bytes: two 4-byte pixels
    movntq     [ebp], mm1            // non-temporal store of 2 pixels
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         pair_loop             // repeat while pixels remain

    popad
    ret
  }
}
|
|
|
|
// Converts a row of Y, U and V samples to 4-byte pixels with MMX, using the
// _kCoefficientsBgraY lookup table. The code is the same as
// FastConvertYUVToRGB32Row except for the table it reads.
//
// Table layout as used here: entries are 8 bytes (four 16 bit words); the Y
// entries start at offset 0, the U entries at offset 2048 and the V entries
// at offset 4096, each indexed by the sample value.
//
// Every pass of the loop reads one U, one V and two Y samples and writes two
// pixels (8 bytes) with a non-temporal store (movntq). The loop body runs
// before the count is tested, so it always executes at least once, and an odd
// width is processed as a full pair.
//
// Naked function: all general registers are saved with pushad / popad.
// No emms is executed here, so the MMX state is left active on return.
// NOTE(review): presumably the caller issues emms - confirm.
__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // edx = y_buf
    mov        edi, [esp + 32 + 8]   // edi = u_buf
    mov        esi, [esp + 32 + 12]  // esi = v_buf
    mov        ebp, [esp + 32 + 16]  // ebp = rgb_buf
    mov        ecx, [esp + 32 + 20]  // ecx = width

  pair_loop:
    movzx      eax, byte ptr [edi]   // U sample
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]   // V sample
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]  // U entry
    movzx      eax, byte ptr [edx]   // first Y sample
    paddsw     mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]  // + V entry
    movzx      ebx, byte ptr [edx + 1]  // second Y sample
    movq       mm1, [_kCoefficientsBgraY + 8 * eax]  // Y entry, pixel 0
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsBgraY + 8 * ebx]  // Y entry, pixel 1
    paddsw     mm1, mm0              // both pixels share the U + V sum
    paddsw     mm2, mm0
    psraw      mm1, 6                // arithmetic shift each word right by 6
    psraw      mm2, 6
    packuswb   mm1, mm2              // saturate to bytes: two 4-byte pixels
    movntq     [ebp], mm1            // non-temporal store of 2 pixels
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         pair_loop             // repeat while pixels remain

    popad
    ret
  }
}
|
|
|
|
// Converts a row of Y, U and V samples to 4-byte pixels with MMX, using the
// _kCoefficientsAbgrY lookup table. The code is the same as
// FastConvertYUVToRGB32Row except for the table it reads.
//
// Table layout as used here: entries are 8 bytes (four 16 bit words); the Y
// entries start at offset 0, the U entries at offset 2048 and the V entries
// at offset 4096, each indexed by the sample value.
//
// Every pass of the loop reads one U, one V and two Y samples and writes two
// pixels (8 bytes) with a non-temporal store (movntq). The loop body runs
// before the count is tested, so it always executes at least once, and an odd
// width is processed as a full pair.
//
// Naked function: all general registers are saved with pushad / popad.
// No emms is executed here, so the MMX state is left active on return.
// NOTE(review): presumably the caller issues emms - confirm.
__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // edx = y_buf
    mov        edi, [esp + 32 + 8]   // edi = u_buf
    mov        esi, [esp + 32 + 12]  // esi = v_buf
    mov        ebp, [esp + 32 + 16]  // ebp = rgb_buf
    mov        ecx, [esp + 32 + 20]  // ecx = width

  pair_loop:
    movzx      eax, byte ptr [edi]   // U sample
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]   // V sample
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]  // U entry
    movzx      eax, byte ptr [edx]   // first Y sample
    paddsw     mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]  // + V entry
    movzx      ebx, byte ptr [edx + 1]  // second Y sample
    movq       mm1, [_kCoefficientsAbgrY + 8 * eax]  // Y entry, pixel 0
    lea        edx, [edx + 2]
    movq       mm2, [_kCoefficientsAbgrY + 8 * ebx]  // Y entry, pixel 1
    paddsw     mm1, mm0              // both pixels share the U + V sum
    paddsw     mm2, mm0
    psraw      mm1, 6                // arithmetic shift each word right by 6
    psraw      mm2, 6
    packuswb   mm1, mm2              // saturate to bytes: two 4-byte pixels
    movntq     [ebp], mm1            // non-temporal store of 2 pixels
    lea        ebp, [ebp + 8]
    sub        ecx, 2
    ja         pair_loop             // repeat while pixels remain

    popad
    ret
  }
}
|
|
|
|
// Converts a row of Y, U and V samples to 4-byte pixels with MMX, using the
// _kCoefficientsRgbY lookup table. Unlike FastConvertYUVToRGB32Row, every
// pixel has its own U and V sample.
//
// Table layout as used here: entries are 8 bytes (four 16 bit words); the Y
// entries start at offset 0, the U entries at offset 2048 and the V entries
// at offset 4096, each indexed by the sample value.
//
// Every pass of the loop reads one Y, one U and one V sample and writes one
// pixel (4 bytes). The loop body runs before the count is tested, so it
// always executes at least once.
//
// Naked function: all general registers are saved with pushad / popad.
// No emms is executed here, so the MMX state is left active on return.
// NOTE(review): presumably the caller issues emms - confirm.
__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad
    mov        edx, [esp + 32 + 4]   // edx = y_buf
    mov        edi, [esp + 32 + 8]   // edi = u_buf
    mov        esi, [esp + 32 + 12]  // esi = v_buf
    mov        ebp, [esp + 32 + 16]  // ebp = rgb_buf
    mov        ecx, [esp + 32 + 20]  // ecx = width

  pixel_loop:
    movzx      eax, byte ptr [edi]   // U sample
    lea        edi, [edi + 1]
    movzx      ebx, byte ptr [esi]   // V sample
    lea        esi, [esi + 1]
    movq       mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // U entry
    movzx      eax, byte ptr [edx]   // Y sample
    paddsw     mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V entry
    lea        edx, [edx + 1]
    paddsw     mm0, [_kCoefficientsRgbY + 8 * eax]  // + Y entry
    psraw      mm0, 6                // arithmetic shift each word right by 6
    packuswb   mm0, mm0              // saturate to bytes: one 4-byte pixel
    movd       [ebp], mm0            // store 1 pixel
    lea        ebp, [ebp + 4]
    sub        ecx, 1
    ja         pixel_loop            // repeat while pixels remain

    popad
    ret
  }
}
|
|
|
|
// Converts a row of Y samples to 4-byte pixels with MMX, using only the
// entries at the start of the _kCoefficientsRgbY lookup table (8 bytes per
// entry, indexed by the Y value). No U or V input is used.
//
// Every pass of the loop reads two Y samples and writes two pixels (8 bytes)
// with a regular movq store. The loop body runs before the count is tested,
// so it always executes at least once, and an odd width is processed as a
// full pair.
//
// Naked function: ebx is saved and restored here; eax, ecx, edx, mm0 and mm1
// are modified. No emms is executed here, so the MMX state is left active on
// return. NOTE(review): presumably the caller issues emms - confirm.
__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]    // eax = y_buf
    mov        edx, [esp + 4 + 8]    // edx = rgb_buf
    mov        ecx, [esp + 4 + 12]   // ecx = width

  pair_loop:
    movzx      ebx, byte ptr [eax]   // first Y sample
    movq       mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw      mm0, 6                // arithmetic shift each word right by 6
    movzx      ebx, byte ptr [eax + 1]  // second Y sample
    movq       mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw      mm1, 6
    packuswb   mm0, mm1              // saturate to bytes: two 4-byte pixels
    lea        eax, [eax + 2]
    movq       [edx], mm0            // store 2 pixels
    lea        edx, [edx + 8]
    sub        ecx, 2
    ja         pair_loop             // repeat while pixels remain

    pop        ebx
    ret
  }
}
|
|
|
|
#endif
|
|
|
|
} // extern "C"
|