mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
Fix NV21 U/V order, align all loops, and make ScaleAddRows support 1 row
BUG=17
TEST=none
Review URL: https://webrtc-codereview.appspot.com/435004
git-svn-id: http://libyuv.googlecode.com/svn/trunk@208 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
ba3aeed3b8
commit
bd4a849bcb
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 207
|
||||
Version: 208
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 207
|
||||
#define LIBYUV_VERSION 208
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
|
||||
@ -89,6 +89,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
|
||||
pxor xmm5, xmm5
|
||||
sub edx, eax
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm1, [eax]
|
||||
movdqa xmm2, [eax + edx]
|
||||
|
||||
@ -1746,8 +1746,8 @@ int ConvertToI420(const uint8* sample, size_t sample_size,
|
||||
r = NV12ToI420Rotate(src, src_width,
|
||||
src_uv, aligned_src_width,
|
||||
y, y_stride,
|
||||
u, u_stride,
|
||||
v, v_stride,
|
||||
u, u_stride,
|
||||
dst_width, inv_dst_height, rotation);
|
||||
break;
|
||||
case FOURCC_M420:
|
||||
|
||||
@ -222,6 +222,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edx, esi
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movq xmm2, qword ptr [esi] // U
|
||||
movq xmm3, qword ptr [esi + edx] // V
|
||||
@ -260,6 +261,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
|
||||
mov ecx, [esp + 8 + 20] // width
|
||||
sub edx, esi
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movq xmm2, qword ptr [esi] // U
|
||||
movq xmm3, qword ptr [esi + edx] // V
|
||||
|
||||
@ -722,6 +722,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
|
||||
lea ecx, [ebp * 4]
|
||||
sub edx, ecx // stride - width * 4
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
mov ecx, ebp
|
||||
rep stosd
|
||||
|
||||
@ -83,9 +83,11 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
mov edx, [esp + 12 + 12] // dst
|
||||
mov esi, [esp + 12 + 16] // dst_stride
|
||||
mov ecx, [esp + 12 + 20] // width
|
||||
convertloop:
|
||||
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
align 16
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea ebp, [eax + 8]
|
||||
movq xmm1, qword ptr [eax + edi]
|
||||
@ -182,6 +184,8 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
and esp, ~15
|
||||
mov [esp + 16], ecx
|
||||
mov ecx, [ecx + 16 + 28] // w
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
|
||||
@ -685,6 +685,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -714,6 +715,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -757,6 +759,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
|
||||
psrld xmm5, 24
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -790,6 +793,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
|
||||
psrlw xmm7, 8
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -848,6 +852,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes
|
||||
psrlq xmm5, 56
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -882,6 +887,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
|
||||
lea edi, [esi + esi * 2] // src_stride * 3
|
||||
pxor xmm7, xmm7
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax] // average 8 rows to 1
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -957,6 +963,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm4, _shuf1
|
||||
movdqa xmm5, _shuf2
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
@ -1009,6 +1016,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm6, _madd11
|
||||
movdqa xmm7, _round34
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax] // pixels 0..7
|
||||
movdqa xmm1, [eax + esi]
|
||||
@ -1066,6 +1074,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm6, _madd11
|
||||
movdqa xmm7, _round34
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax] // pixels 0..7
|
||||
movdqa xmm1, [eax + esi]
|
||||
@ -1123,6 +1132,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm4, _shuf38a
|
||||
movdqa xmm5, _shuf38b
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5
|
||||
movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11
|
||||
@ -1158,6 +1168,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm6, _scaleac3
|
||||
pxor xmm7, xmm7
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
movdqa xmm0, [eax] // sum up 3 rows into xmm0/1
|
||||
movdqa xmm2, [eax + esi]
|
||||
@ -1224,6 +1235,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
movdqa xmm6, _shufab2
|
||||
movdqa xmm7, _scaleab2
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
movdqa xmm2, [eax] // average 2 rows into xmm2
|
||||
pavgb xmm2, [eax + esi]
|
||||
@ -1256,8 +1268,6 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
|
||||
#define HAS_SCALEADDROWS_SSE2
|
||||
|
||||
// Reads 16xN bytes and produces 16 shorts at a time.
|
||||
// TODO(fbarchard): support 1 rows
|
||||
// TODO(fbarchard): align loops
|
||||
__declspec(naked)
|
||||
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
uint16* dst_ptr, int src_width,
|
||||
@ -1275,6 +1285,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
pxor xmm4, xmm4
|
||||
dec ebx
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
// first row
|
||||
movdqa xmm0, [esi]
|
||||
@ -1284,8 +1295,11 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
punpckhbw xmm1, xmm4
|
||||
lea esi, [esi + 16]
|
||||
mov ebp, ebx
|
||||
test ebp, ebp
|
||||
je ydone
|
||||
|
||||
// sum remaining rows
|
||||
align 16
|
||||
yloop:
|
||||
movdqa xmm2, [eax] // read 16 pixels
|
||||
lea eax, [eax + edx] // advance to next row
|
||||
@ -1296,7 +1310,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
paddusw xmm1, xmm3
|
||||
sub ebp, 1
|
||||
ja yloop
|
||||
|
||||
ydone:
|
||||
movdqa [edi], xmm0
|
||||
movdqa [edi + 16], xmm1
|
||||
lea edi, [edi + 32]
|
||||
@ -1342,6 +1356,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
pshufd xmm5, xmm5, 0
|
||||
pxor xmm7, xmm7
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
movdqa xmm0, [esi]
|
||||
movdqa xmm2, [esi + edx]
|
||||
@ -1371,6 +1386,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
pop esi
|
||||
ret
|
||||
|
||||
align 16
|
||||
xloop1:
|
||||
movdqa xmm0, [esi]
|
||||
sub ecx, 16
|
||||
@ -1384,6 +1400,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
|
||||
pop esi
|
||||
ret
|
||||
|
||||
align 16
|
||||
xloop2:
|
||||
movdqa xmm0, [esi]
|
||||
pavgb xmm0, [esi + edx]
|
||||
@ -1428,6 +1445,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
punpcklwd xmm5, xmm5
|
||||
pshufd xmm5, xmm5, 0
|
||||
|
||||
align 16
|
||||
xloop:
|
||||
movdqa xmm0, [esi]
|
||||
movdqa xmm2, [esi + edx]
|
||||
@ -1450,6 +1468,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
pop esi
|
||||
ret
|
||||
|
||||
align 16
|
||||
xloop1:
|
||||
movdqa xmm0, [esi]
|
||||
sub ecx, 16
|
||||
@ -1463,6 +1482,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
pop esi
|
||||
ret
|
||||
|
||||
align 16
|
||||
xloop2:
|
||||
movdqa xmm0, [esi]
|
||||
pavgb xmm0, [esi + edx]
|
||||
@ -1496,6 +1516,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
|
||||
movdqa xmm6, _madd11
|
||||
movdqa xmm7, _madd21
|
||||
|
||||
align 16
|
||||
wloop:
|
||||
movdqa xmm0, [eax] // pixels 0..7
|
||||
pshufb xmm0, xmm2
|
||||
@ -1712,6 +1733,8 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
"punpcklbw %%xmm4,%%xmm0 \n"
|
||||
"punpckhbw %%xmm4,%%xmm1 \n"
|
||||
"mov %5,%2 \n"
|
||||
"test %2,%2 \n"
|
||||
"je 3f \n"
|
||||
"2: \n"
|
||||
"movdqa (%0),%%xmm2 \n"
|
||||
"add %6,%0 \n"
|
||||
@ -1722,6 +1745,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
|
||||
"paddusw %%xmm3,%%xmm1 \n"
|
||||
"sub $0x1,%2 \n"
|
||||
"ja 2b \n"
|
||||
"3: \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"lea 0x10(%3),%0 \n"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user