diff --git a/README.chromium b/README.chromium
index a6e13d7fb..9af93db0a 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 190
+Version: 191
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 14be9f165..48186571e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 190
+#define LIBYUV_VERSION 191

 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/rotate.cc b/source/rotate.cc
index e7cc7ded1..ef399924d 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -80,7 +80,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
 __declspec(naked)
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
-__asm {
+  __asm {
     push edi
     push esi
     push ebp
@@ -154,9 +154,9 @@ __asm {
     movq qword ptr [edx], xmm3
     movdqa xmm7, xmm3
     palignr xmm7, xmm7, 8
+    sub ecx, 8
     movq qword ptr [edx + esi], xmm7
     lea edx, [edx + 2 * esi]
-    sub ecx, 8
     ja convertloop

     pop ebp
@@ -172,7 +172,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                 uint8* dst_a, int dst_stride_a,
                                 uint8* dst_b, int dst_stride_b,
                                 int w) {
-__asm {
+  __asm {
     push ebx
     push esi
     push edi
@@ -278,11 +278,11 @@ __asm {
     movlpd qword ptr [edx], xmm3
     movhpd qword ptr [ebx], xmm3
     punpckhdq xmm0, xmm7
+    sub ecx, 8
     movlpd qword ptr [edx + esi], xmm0
     lea edx, [edx + 2 * esi]
     movhpd qword ptr [ebx + ebp], xmm0
     lea ebx, [ebx + 2 * ebp]
-    sub ecx, 8
     ja convertloop

     mov esp, [esp + 16]
@@ -365,9 +365,9 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   "movq %%xmm3,(%1) \n"
   "movdqa %%xmm3,%%xmm7 \n"
   "palignr $0x8,%%xmm7,%%xmm7 \n"
+  "sub $0x8,%2 \n"
   "movq %%xmm7,(%1,%4) \n"
   "lea (%1,%4,2),%1 \n"
-  "sub $0x8,%2 \n"
   "ja 1b \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -490,11 +490,11 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   "movlpd %xmm3,(%edx) \n"
   "movhpd %xmm3,(%ebx) \n"
   "punpckhdq %xmm7,%xmm0 \n"
+  "sub $0x8,%ecx \n"
   "movlpd %xmm0,(%edx,%esi,1) \n"
   "lea (%edx,%esi,2),%edx \n"
   "movhpd %xmm0,(%ebx,%ebp,1) \n"
   "lea (%ebx,%ebp,2),%ebx \n"
-  "sub $0x8,%ecx \n"
   "ja 1b \n"
   "mov 0x10(%esp),%esp \n"
   "pop %ebp \n"
@@ -628,9 +628,9 @@ static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
   "movq %%xmm11,(%1) \n"
   "movdqa %%xmm11,%%xmm15 \n"
   "palignr $0x8,%%xmm15,%%xmm15 \n"
+  "sub $0x10,%2 \n"
   "movq %%xmm15,(%1,%4) \n"
   "lea (%1,%4,2),%1 \n"
-  "sub $0x10,%2 \n"
   "ja 1b \n"
   : "+r"(src),  // %0
     "+r"(dst),  // %1
@@ -734,11 +734,11 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
   "movlpd %%xmm3,(%1) \n"
   "movhpd %%xmm3,(%2) \n"
   "punpckhdq %%xmm7,%%xmm8 \n"
+  "sub $0x8,%3 \n"
   "movlpd %%xmm8,(%1,%5) \n"
   "lea (%1,%5,2),%1 \n"
   "movhpd %%xmm8,(%2,%6) \n"
   "lea (%2,%6,2),%2 \n"
-  "sub $0x8,%3 \n"
   "ja 1b \n"
   : "+r"(src),    // %0
     "+r"(dst_a),  // %1
@@ -1023,11 +1023,11 @@ __asm {
     movdqa xmm0, [eax]
     lea eax, [eax - 16]
     pshufb xmm0, xmm5
-    movlpd qword ptr [edx], xmm0
-    movhpd qword ptr [edi], xmm0
-    lea edx, [edx + 8]
-    lea edi, [edi + 8]
     sub ecx, 8
+    movlpd qword ptr [edx], xmm0
+    lea edx, [edx + 8]
+    movhpd qword ptr [edi], xmm0
+    lea edi, [edi + 8]
     ja convertloop
     pop edi
     ret
@@ -1042,18 +1042,18 @@ void MirrorRowUV_SSSE3(const uint8* src,
                        int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
-  "movdqa %4,%%xmm5 \n"
-  "lea -16(%0,%3,2),%0 \n"
-"1: \n"
-  "movdqa (%0),%%xmm0 \n"
-  "lea -16(%0),%0 \n"
-  "pshufb %%xmm5,%%xmm0 \n"
-  "movlpd %%xmm0,(%1) \n"
-  "movhpd %%xmm0,(%2) \n"
-  "lea 8(%1),%1 \n"
-  "lea 8(%2),%2 \n"
-  "sub $8,%3 \n"
-  "ja 1b \n"
+    "movdqa %4,%%xmm5 \n"
+    "lea -16(%0,%3,2),%0 \n"
+    "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "lea -16(%0),%0 \n"
+    "pshufb %%xmm5,%%xmm0 \n"
+    "sub $8,%3 \n"
+    "movlpd %%xmm0,(%1) \n"
+    "lea 8(%1),%1 \n"
+    "movhpd %%xmm0,(%2) \n"
+    "lea 8(%2),%2 \n"
+    "ja 1b \n"
   : "+r"(src),    // %0
     "+r"(dst_a),  // %1
     "+r"(dst_b),  // %2
diff --git a/source/scale.cc b/source/scale.cc
index b31e0b694..6ac4d48bd 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -692,9 +692,9 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
     pand xmm0, xmm5
     pand xmm1, xmm5
     packuswb xmm0, xmm1
+    sub ecx, 16
     movdqa [edx], xmm0
     lea edx, [edx + 16]
-    sub ecx, 16
     ja wloop

     ret
@@ -733,9 +733,9 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     pavgw xmm1, xmm3
     packuswb xmm0, xmm1

+    sub ecx, 16
     movdqa [edx], xmm0
     lea edx, [edx + 16]
-    sub ecx, 16
     ja wloop

     pop esi
@@ -750,28 +750,26 @@ __declspec(naked)
 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
+    mov eax, [esp + 4]        // src_ptr
                               // src_stride ignored
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    mov edx, [esp + 12]       // dst_ptr
+    mov ecx, [esp + 16]       // dst_width
     pcmpeqb xmm5, xmm5        // generate mask 0x000000ff
     psrld xmm5, 24

  wloop:
-    movdqa xmm0, [esi]
-    movdqa xmm1, [esi + 16]
-    lea esi, [esi + 32]
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    lea eax, [eax + 32]
     pand xmm0, xmm5
     pand xmm1, xmm5
     packuswb xmm0, xmm1
     packuswb xmm0, xmm0
-    movq qword ptr [edi], xmm0
-    lea edi, [edi + 8]
     sub ecx, 8
+    movq qword ptr [edx], xmm0
+    lea edx, [edx + 8]
     ja wloop

-    popad
     ret
   }
 }
@@ -782,27 +780,28 @@ __declspec(naked)
 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov ebx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]    // src_ptr
+    mov esi, [esp + 8 + 8]    // src_stride
+    mov edx, [esp + 8 + 12]   // dst_ptr
+    mov ecx, [esp + 8 + 16]   // dst_width
+    lea edi, [esi + esi * 2]  // src_stride * 3
     pcmpeqb xmm7, xmm7        // generate mask 0x00ff00ff
     psrlw xmm7, 8
-    lea edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
-    movdqa xmm0, [esi]
-    movdqa xmm1, [esi + 16]
-    movdqa xmm2, [esi + ebx]
-    movdqa xmm3, [esi + ebx + 16]
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    movdqa xmm2, [eax + esi]
+    movdqa xmm3, [eax + esi + 16]
     pavgb xmm0, xmm2          // average rows
     pavgb xmm1, xmm3
-    movdqa xmm2, [esi + ebx * 2]
-    movdqa xmm3, [esi + ebx * 2 + 16]
-    movdqa xmm4, [esi + edx]
-    movdqa xmm5, [esi + edx + 16]
-    lea esi, [esi + 32]
+    movdqa xmm2, [eax + esi * 2]
+    movdqa xmm3, [eax + esi * 2 + 16]
+    movdqa xmm4, [eax + edi]
+    movdqa xmm5, [eax + edi + 16]
+    lea eax, [eax + 32]
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
     pavgb xmm0, xmm2
@@ -824,12 +823,13 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
     pavgw xmm0, xmm2
     packuswb xmm0, xmm0

-    movq qword ptr [edi], xmm0
-    lea edi, [edi + 8]
     sub ecx, 8
+    movq qword ptr [edx], xmm0
+    lea edx, [edx + 8]
     ja wloop

-    popad
+    pop edi
+    pop esi
     ret
   }
 }
@@ -841,29 +841,27 @@ __declspec(naked)
 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
+    mov eax, [esp + 4]        // src_ptr
                               // src_stride ignored
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    mov edx, [esp + 12]       // dst_ptr
+    mov ecx, [esp + 16]       // dst_width
     pcmpeqb xmm5, xmm5        // generate mask isolating 1 src 8 bytes
     psrlq xmm5, 56

  wloop:
-    movdqa xmm0, [esi]
-    movdqa xmm1, [esi + 16]
-    lea esi, [esi + 32]
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    lea eax, [eax + 32]
     pand xmm0, xmm5
     pand xmm1, xmm5
     packuswb xmm0, xmm1  // 32->16
     packuswb xmm0, xmm0  // 16->8
     packuswb xmm0, xmm0  // 8->4
-    movd dword ptr [edi], xmm0
-    lea edi, [edi + 4]
     sub ecx, 4
+    movd dword ptr [edx], xmm0
+    lea edx, [edx + 4]
     ja wloop

-    popad
     ret
   }
 }
@@ -874,27 +872,29 @@ __declspec(naked)
 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov ebx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
-    lea edx, [ebx + ebx * 2]  // src_stride * 3
+    push esi
+    push edi
+    push ebp
+    mov eax, [esp + 12 + 4]   // src_ptr
+    mov esi, [esp + 12 + 8]   // src_stride
+    mov edx, [esp + 12 + 12]  // dst_ptr
+    mov ecx, [esp + 12 + 16]  // dst_width
+    lea edi, [esi + esi * 2]  // src_stride * 3
     pxor xmm7, xmm7

  wloop:
-    movdqa xmm0, [esi]        // average 8 rows to 1
-    movdqa xmm1, [esi + 16]
-    movdqa xmm2, [esi + ebx]
-    movdqa xmm3, [esi + ebx + 16]
+    movdqa xmm0, [eax]        // average 8 rows to 1
+    movdqa xmm1, [eax + 16]
+    movdqa xmm2, [eax + esi]
+    movdqa xmm3, [eax + esi + 16]
     pavgb xmm0, xmm2
     pavgb xmm1, xmm3
-    movdqa xmm2, [esi + ebx * 2]
-    movdqa xmm3, [esi + ebx * 2 + 16]
-    movdqa xmm4, [esi + edx]
-    movdqa xmm5, [esi + edx + 16]
-    lea ebp, [esi + ebx * 4]
-    lea esi, [esi + 32]
+    movdqa xmm2, [eax + esi * 2]
+    movdqa xmm3, [eax + esi * 2 + 16]
+    movdqa xmm4, [eax + edi]
+    movdqa xmm5, [eax + edi + 16]
+    lea ebp, [eax + esi * 4]
+    lea eax, [eax + 32]
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
     pavgb xmm0, xmm2
@@ -902,15 +902,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,

     movdqa xmm2, [ebp]
     movdqa xmm3, [ebp + 16]
-    movdqa xmm4, [ebp + ebx]
-    movdqa xmm5, [ebp + ebx + 16]
+    movdqa xmm4, [ebp + esi]
+    movdqa xmm5, [ebp + esi + 16]
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
-    movdqa xmm4, [ebp + ebx * 2]
-    movdqa xmm5, [ebp + ebx * 2 + 16]
-    movdqa xmm6, [ebp + edx]
+    movdqa xmm4, [ebp + esi * 2]
+    movdqa xmm5, [ebp + esi * 2 + 16]
+    movdqa xmm6, [ebp + edi]
     pavgb xmm4, xmm6
-    movdqa xmm6, [ebp + edx + 16]
+    movdqa xmm6, [ebp + edi + 16]
     pavgb xmm5, xmm6
     pavgb xmm2, xmm4
     pavgb xmm3, xmm5
@@ -925,13 +925,15 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
     psrlw xmm0, 3
     packuswb xmm0, xmm0
     packuswb xmm0, xmm0
-    movd dword ptr [edi], xmm0
-    lea edi, [edi + 4]
     sub ecx, 4
+    movd dword ptr [edx], xmm0
+    lea edx, [edx + 4]
     ja wloop

-    popad
+    pop ebp
+    pop edi
+    pop esi
     ret
   }
 }
@@ -947,32 +949,30 @@ __declspec(naked)
 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
+    mov eax, [esp + 4]        // src_ptr
                               // src_stride ignored
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    mov edx, [esp + 12]       // dst_ptr
+    mov ecx, [esp + 16]       // dst_width
     movdqa xmm3, _shuf0
     movdqa xmm4, _shuf1
     movdqa xmm5, _shuf2

  wloop:
-    movdqa xmm0, [esi]
-    movdqa xmm1, [esi + 16]
-    lea esi, [esi + 32]
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    lea eax, [eax + 32]
     movdqa xmm2, xmm1
     palignr xmm1, xmm0, 8
     pshufb xmm0, xmm3
     pshufb xmm1, xmm4
     pshufb xmm2, xmm5
-    movq qword ptr [edi], xmm0
-    movq qword ptr [edi + 8], xmm1
-    movq qword ptr [edi + 16], xmm2
-    lea edi, [edi + 24]
+    movq qword ptr [edx], xmm0
+    movq qword ptr [edx + 8], xmm1
+    movq qword ptr [edx + 16], xmm2
+    lea edx, [edx + 24]
     sub ecx, 24
     ja wloop

-    popad
     ret
   }
 }
@@ -997,11 +997,11 @@ __declspec(naked)
 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov ebx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    push esi
+    mov eax, [esp + 4 + 4]    // src_ptr
+    mov esi, [esp + 4 + 8]    // src_stride
+    mov edx, [esp + 4 + 12]   // dst_ptr
+    mov ecx, [esp + 4 + 16]   // dst_width
     movdqa xmm2, _shuf01
     movdqa xmm3, _shuf11
     movdqa xmm4, _shuf21
@@ -1010,27 +1010,27 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa xmm7, _round34

  wloop:
-    movdqa xmm0, [esi]        // pixels 0..7
-    movdqa xmm1, [esi+ebx]
+    movdqa xmm0, [eax]        // pixels 0..7
+    movdqa xmm1, [eax + esi]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
     pmaddubsw xmm0, xmm5
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi], xmm0
-    movdqu xmm0, [esi+8]      // pixels 8..15
-    movdqu xmm1, [esi+ebx+8]
+    movq qword ptr [edx], xmm0
+    movdqu xmm0, [eax + 8]    // pixels 8..15
+    movdqu xmm1, [eax + esi + 8]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm3
     pmaddubsw xmm0, xmm6
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi+8], xmm0
-    movdqa xmm0, [esi+16]     // pixels 16..23
-    movdqa xmm1, [esi+ebx+16]
-    lea esi, [esi+32]
+    movq qword ptr [edx + 8], xmm0
+    movdqa xmm0, [eax + 16]   // pixels 16..23
+    movdqa xmm1, [eax + esi + 16]
+    lea eax, [eax + 32]
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
     movdqa xmm1, _madd21
@@ -1038,12 +1038,12 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi+16], xmm0
-    lea edi, [edi+24]
     sub ecx, 24
+    movq qword ptr [edx + 16], xmm0
+    lea edx, [edx + 24]
     ja wloop

-    popad
+    pop esi
     ret
   }
 }
@@ -1054,11 +1054,11 @@ __declspec(naked)
 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov ebx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    push esi
+    mov eax, [esp + 4 + 4]    // src_ptr
+    mov esi, [esp + 4 + 8]    // src_stride
+    mov edx, [esp + 4 + 12]   // dst_ptr
+    mov ecx, [esp + 4 + 16]   // dst_width
     movdqa xmm2, _shuf01
     movdqa xmm3, _shuf11
     movdqa xmm4, _shuf21
@@ -1067,8 +1067,8 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     movdqa xmm7, _round34

  wloop:
-    movdqa xmm0, [esi]        // pixels 0..7
-    movdqa xmm1, [esi+ebx]
+    movdqa xmm0, [eax]        // pixels 0..7
+    movdqa xmm1, [eax + esi]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm2
@@ -1076,9 +1076,9 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi], xmm0
-    movdqu xmm0, [esi+8]      // pixels 8..15
-    movdqu xmm1, [esi+ebx+8]
+    movq qword ptr [edx], xmm0
+    movdqu xmm0, [eax + 8]    // pixels 8..15
+    movdqu xmm1, [eax + esi + 8]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm3
@@ -1086,10 +1086,10 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi+8], xmm0
-    movdqa xmm0, [esi+16]     // pixels 16..23
-    movdqa xmm1, [esi+ebx+16]
-    lea esi, [esi+32]
+    movq qword ptr [edx + 8], xmm0
+    movdqa xmm0, [eax + 16]   // pixels 16..23
+    movdqa xmm1, [eax + esi + 16]
+    lea eax, [eax + 32]
     pavgb xmm1, xmm0
     pavgb xmm0, xmm1
     pshufb xmm0, xmm4
@@ -1098,12 +1098,12 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
     paddsw xmm0, xmm7
     psrlw xmm0, 2
     packuswb xmm0, xmm0
-    movq qword ptr [edi+16], xmm0
-    lea edi, [edi+24]
     sub ecx, 24
+    movq qword ptr [edx + 16], xmm0
+    lea edx, [edx+24]
     ja wloop

-    popad
+    pop esi
     ret
   }
 }
@@ -1116,30 +1116,28 @@ __declspec(naked)
 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov edx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    mov eax, [esp + 4]        // src_ptr
+                              // src_stride ignored
+    mov edx, [esp + 12]       // dst_ptr
+    mov ecx, [esp + 16]       // dst_width
     movdqa xmm4, _shuf38a
     movdqa xmm5, _shuf38b

  xloop:
-    movdqa xmm0, [esi]        // 16 pixels -> 0,1,2,3,4,5
-    movdqa xmm1, [esi + 16]   // 16 pixels -> 6,7,8,9,10,11
-    lea esi, [esi + 32]
+    movdqa xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
+    movdqa xmm1, [eax + 16]   // 16 pixels -> 6,7,8,9,10,11
+    lea eax, [eax + 32]
     pshufb xmm0, xmm4
     pshufb xmm1, xmm5
     paddusb xmm0, xmm1
-    movq qword ptr [edi], xmm0  // write 12 pixels
-    movhlps xmm1, xmm0
-    movd [edi + 8], xmm1
-    lea edi, [edi + 12]
     sub ecx, 12
+    movq qword ptr [edx], xmm0  // write 12 pixels
+    movhlps xmm1, xmm0
+    movd [edx + 8], xmm1
+    lea edx, [edx + 12]
     ja xloop

-    popad
     ret
   }
 }
@@ -1149,19 +1147,20 @@ __declspec(naked)
 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov edx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    push esi
+    push ebx
+    mov eax, [esp + 8 + 4]    // src_ptr
+    mov esi, [esp + 8 + 8]    // src_stride
+    mov edx, [esp + 8 + 12]   // dst_ptr
+    mov ecx, [esp + 8 + 16]   // dst_width
     movdqa xmm4, _shufac0
     movdqa xmm5, _shufac3
     movdqa xmm6, _scaleac3
     pxor xmm7, xmm7

  xloop:
-    movdqa xmm0, [esi]        // sum up 3 rows into xmm0/1
-    movdqa xmm2, [esi + edx]
+    movdqa xmm0, [eax]        // sum up 3 rows into xmm0/1
+    movdqa xmm2, [eax + esi]
     movhlps xmm1, xmm0
     movhlps xmm3, xmm2
     punpcklbw xmm0, xmm7
@@ -1170,8 +1169,8 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     punpcklbw xmm3, xmm7
     paddusw xmm0, xmm2
     paddusw xmm1, xmm3
-    movdqa xmm2, [esi + edx * 2]
-    lea esi, [esi + 16]
+    movdqa xmm2, [eax + esi * 2]
+    lea eax, [eax + 16]
     movhlps xmm3, xmm2
     punpcklbw xmm2, xmm7
     punpcklbw xmm3, xmm7
@@ -1196,14 +1195,15 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pmulhuw xmm2, xmm6        // divide by 9,9,6, 9,9,6
     packuswb xmm2, xmm2

-    movd [edi], xmm2          // write 6 pixels
-    pextrw eax, xmm2, 2
-    mov [edi + 4], ax
-    lea edi, [edi + 6]
     sub ecx, 6
+    movd [edx], xmm2          // write 6 pixels
+    pextrw ebx, xmm2, 2
+    mov [edx + 4], bx
+    lea edx, [edx + 6]
     ja xloop

-    popad
+    pop ebx
+    pop esi
     ret
   }
 }
@@ -1213,20 +1213,21 @@ __declspec(naked)
 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                        uint8* dst_ptr, int dst_width) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov edx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
+    push esi
+    push ebx
+    mov eax, [esp + 8 + 4]    // src_ptr
+    mov esi, [esp + 8 + 8]    // src_stride
+    mov edx, [esp + 8 + 12]   // dst_ptr
+    mov ecx, [esp + 8 + 16]   // dst_width
     movdqa xmm4, _shufab0
     movdqa xmm5, _shufab1
     movdqa xmm6, _shufab2
     movdqa xmm7, _scaleab2

  xloop:
-    movdqa xmm2, [esi]        // average 2 rows into xmm2
-    pavgb xmm2, [esi + edx]
-    lea esi, [esi + 16]
+    movdqa xmm2, [eax]        // average 2 rows into xmm2
+    pavgb xmm2, [eax + esi]
+    lea eax, [eax + 16]

     movdqa xmm0, xmm2         // 16 pixels -> 0,1,2,3,4,5 of xmm0
     pshufb xmm0, xmm4
@@ -1239,65 +1240,72 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
     pmulhuw xmm0, xmm7        // divide by 3,3,2, 3,3,2
     packuswb xmm0, xmm0

-    movd [edi], xmm0          // write 6 pixels
-    pextrw eax, xmm0, 2
-    mov [edi + 4], ax
-    lea edi, [edi + 6]
     sub ecx, 6
+    movd [edx], xmm0          // write 6 pixels
+    pextrw ebx, xmm0, 2
+    mov [edx + 4], bx
+    lea edx, [edx + 6]
     ja xloop

-    popad
+    pop ebx
+    pop esi
     ret
   }
 }

 #define HAS_SCALEADDROWS_SSE2
-// Reads 8xN bytes and produces 16 shorts at a time.
+// Reads 16xN bytes and produces 16 shorts at a time.
 __declspec(naked)
 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                               uint16* dst_ptr, int src_width,
                               int src_height) {
   __asm {
-    pushad
-    mov esi, [esp + 32 + 4]   // src_ptr
-    mov edx, [esp + 32 + 8]   // src_stride
-    mov edi, [esp + 32 + 12]  // dst_ptr
-    mov ecx, [esp + 32 + 16]  // dst_width
-    mov ebx, [esp + 32 + 20]  // height
-    pxor xmm5, xmm5
+    push esi
+    push edi
+    push ebx
+    push ebp
+    mov esi, [esp + 16 + 4]   // src_ptr
+    mov edx, [esp + 16 + 8]   // src_stride
+    mov edi, [esp + 16 + 12]  // dst_ptr
+    mov ecx, [esp + 16 + 16]  // dst_width
+    mov ebx, [esp + 16 + 20]  // height
+    pxor xmm4, xmm4
     dec ebx

  xloop:
     // first row
-    movdqa xmm2, [esi]
+    movdqa xmm0, [esi]
     lea eax, [esi + edx]
-    movhlps xmm3, xmm2
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm4
+    punpckhbw xmm1, xmm4
+    lea esi, [esi + 16]
     mov ebp, ebx
-    punpcklbw xmm2, xmm5
-    punpcklbw xmm3, xmm5

     // sum remaining rows
  yloop:
-    movdqa xmm0, [eax]        // read 16 pixels
+    movdqa xmm2, [eax]        // read 16 pixels
     lea eax, [eax + edx]      // advance to next row
-    movhlps xmm1, xmm0
-    punpcklbw xmm0, xmm5
-    punpcklbw xmm1, xmm5
-    paddusw xmm2, xmm0        // sum 16 words
-    paddusw xmm3, xmm1
+    movdqa xmm3, xmm2
+    punpcklbw xmm2, xmm4
+    punpckhbw xmm3, xmm4
+    paddusw xmm0, xmm2        // sum 16 words
+    paddusw xmm1, xmm3
     sub ebp, 1
     ja yloop

-    movdqa [edi], xmm2
-    movdqa [edi + 16], xmm3
+    movdqa [edi], xmm0
+    movdqa [edi + 16], xmm1
     lea edi, [edi + 32]
-    lea esi, [esi + 16]
     sub ecx, 16
     ja xloop

-    popad
+    pop ebp
+    pop ebx
+    pop edi
+    pop esi
     ret
   }
 }
@@ -1508,9 +1516,9 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
     paddsw xmm0, xmm1
     psrlw xmm0, 2
     packuswb xmm0, xmm0
+    sub ecx, 24
     movq qword ptr [edx+16], xmm0
     lea edx, [edx+24]
-    sub ecx, 24
     ja wloop
     ret
   }
@@ -1527,7 +1535,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $0x8,%%xmm5 \n"
-"1:"
+"1: \n"
   "movdqa (%0),%%xmm0 \n"
   "movdqa 0x10(%0),%%xmm1 \n"
   "lea 0x20(%0),%0 \n"
@@ -1551,7 +1559,7 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrlw $0x8,%%xmm5 \n"
-"1:"
+"1: \n"
   "movdqa (%0),%%xmm0 \n"
   "movdqa 0x10(%0),%%xmm1 \n"
   "movdqa (%0,%3,1),%%xmm2 \n"
@@ -1586,7 +1594,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
   asm volatile (
   "pcmpeqb %%xmm5,%%xmm5 \n"
   "psrld $0x18,%%xmm5 \n"
-"1:"
+"1: \n"
   "movdqa (%0),%%xmm0 \n"
   "movdqa 0x10(%0),%%xmm1 \n"
   "lea 0x20(%0),%0 \n"
@@ -1613,7 +1621,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
   "pcmpeqb %%xmm7,%%xmm7 \n"
   "psrlw $0x8,%%xmm7 \n"
   "lea (%4,%4,2),%3 \n"
-"1:"
+"1: \n"
   "movdqa (%0),%%xmm0 \n"
   "movdqa 0x10(%0),%%xmm1 \n"
   "movdqa (%0,%4,1),%%xmm2 \n"
@@ -1663,27 +1671,72 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
-  "pcmpeqb %%xmm5,%%xmm5 \n"
-  "psrlq $0x38,%%xmm5 \n"
-"1:"
-  "movdqa (%0),%%xmm0 \n"
-  "movdqa 0x10(%0),%%xmm1 \n"
-  "lea 0x20(%0),%0 \n"
-  "pand %%xmm5,%%xmm0 \n"
-  "pand %%xmm5,%%xmm1 \n"
-  "packuswb %%xmm1,%%xmm0 \n"
-  "packuswb %%xmm0,%%xmm0 \n"
-  "packuswb %%xmm0,%%xmm0 \n"
-  "movd %%xmm0,(%1) \n"
-  "lea 0x4(%1),%1 \n"
-  "sub $0x4,%2 \n"
-  "ja 1b \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "psrlq $0x38,%%xmm5 \n"
+    "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa 0x10(%0),%%xmm1 \n"
+    "lea 0x20(%0),%0 \n"
+    "pand %%xmm5,%%xmm0 \n"
+    "pand %%xmm5,%%xmm1 \n"
+    "packuswb %%xmm1,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "movd %%xmm0,(%1) \n"
+    "lea 0x4(%1),%1 \n"
+    "sub $0x4,%2 \n"
+    "ja 1b \n"
   : "+r"(src_ptr),   // %0
     "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
   :
   : "memory", "cc"
-);
+  );
+}
+
+#define HAS_SCALEADDROWS_SSE2
+static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
+                              uint16* dst_ptr, int src_width, int src_height) {
+  int tmp_height = 0;
+  intptr_t tmp_src = 0;
+  asm volatile (
+    "pxor %%xmm4,%%xmm4 \n"
+    "sub $0x1,%3 \n"
+    "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "lea (%0,%6,1),%5 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklbw %%xmm4,%%xmm0 \n"
+    "punpckhbw %%xmm4,%%xmm1 \n"
+    "lea 0x10(%0),%0 \n"
+    "mov %3,%4 \n"
+    "2: \n"
+    "movdqa (%5),%%xmm2 \n"
+    "lea (%5,%6,1),%5 \n"
+    "movdqa %%xmm2,%%xmm3 \n"
+    "punpcklbw %%xmm4,%%xmm2 \n"
+    "punpckhbw %%xmm4,%%xmm3 \n"
+    "paddusw %%xmm2,%%xmm0 \n"
+    "paddusw %%xmm3,%%xmm1 \n"
+    "sub $0x1,%4 \n"
+    "ja 2b \n"
+    "movdqa %%xmm0,(%1) \n"
+    "movdqa %%xmm1,0x10(%1) \n"
+    "lea 0x20(%1),%1 \n"
+    "sub $0x10,%2 \n"
+    "ja 1b \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+rm"(src_width),  // %2
+    "+rm"(src_height), // %3
+    "+r"(tmp_height),  // %4
+    "+r"(tmp_src)      // %5
+  : "r"(static_cast<intptr_t>(src_stride))  // %6
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
 }

 #if defined(__i386__)
@@ -1740,9 +1793,9 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
   "psrlw $0x3,%xmm0 \n"
   "packuswb %xmm0,%xmm0 \n"
   "packuswb %xmm0,%xmm0 \n"
+  "sub $0x4,%ecx \n"
   "movd %xmm0,(%edi) \n"
   "lea 0x4(%edi),%edi \n"
-  "sub $0x4,%ecx \n"
   "ja 1b \n"
   "popa \n"
   "ret \n"
@@ -1827,9 +1880,9 @@ extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "paddsw %xmm7,%xmm0 \n"
   "psrlw $0x2,%xmm0 \n"
   "packuswb %xmm0,%xmm0 \n"
+  "sub $0x18,%ecx \n"
   "movq %xmm0,0x10(%edi) \n"
   "lea 0x18(%edi),%edi \n"
-  "sub $0x18,%ecx \n"
   "ja 1b \n"

   "popa \n"
@@ -1884,9 +1937,9 @@ extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
   "paddsw %xmm7,%xmm0 \n"
   "psrlw $0x2,%xmm0 \n"
   "packuswb %xmm0,%xmm0 \n"
+  "sub $0x18,%ecx \n"
   "movq %xmm0,0x10(%edi) \n"
   "lea 0x18(%edi),%edi \n"
-  "sub $0x18,%ecx \n"
   "ja 1b \n"
   "popa \n"
   "ret \n"
@@ -1914,9 +1967,9 @@ extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
   "paddusb %xmm1,%xmm0 \n"
   "movq %xmm0,(%edi) \n"
   "movhlps %xmm0,%xmm1 \n"
+  "sub $0xc,%ecx \n"
   "movd %xmm1,0x8(%edi) \n"
   "lea 0xc(%edi),%edi \n"
-  "sub $0xc,%ecx \n"
   "ja 1b \n"
   "popa \n"
   "ret \n"
@@ -2017,49 +2070,6 @@ extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
 );
 #endif  // __PIC__

-#define HAS_SCALEADDROWS_SSE2
-extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint16* dst_ptr, int src_width,
-                                  int src_height);
-  asm(
-    DECLARE_FUNCTION(ScaleAddRows_SSE2)
-  "pusha \n"
-  "mov 0x24(%esp),%esi \n"
-  "mov 0x28(%esp),%edx \n"
-  "mov 0x2c(%esp),%edi \n"
-  "mov 0x30(%esp),%ecx \n"
-  "mov 0x34(%esp),%ebx \n"
-  "pxor %xmm5,%xmm5 \n"
-
-"1:"
-  "movdqa (%esi),%xmm2 \n"
-  "lea (%esi,%edx,1),%eax \n"
-  "movhlps %xmm2,%xmm3 \n"
-  "lea -0x1(%ebx),%ebp \n"
-  "punpcklbw %xmm5,%xmm2 \n"
-  "punpcklbw %xmm5,%xmm3 \n"
-
-"2:"
-  "movdqa (%eax),%xmm0 \n"
-  "lea (%eax,%edx,1),%eax \n"
-  "movhlps %xmm0,%xmm1 \n"
-  "punpcklbw %xmm5,%xmm0 \n"
-  "punpcklbw %xmm5,%xmm1 \n"
-  "paddusw %xmm0,%xmm2 \n"
-  "paddusw %xmm1,%xmm3 \n"
-  "sub $0x1,%ebp \n"
-  "ja 2b \n"
-
-  "movdqa %xmm2,(%edi) \n"
-  "movdqa %xmm3,0x10(%edi) \n"
-  "lea 0x20(%edi),%edi \n"
-  "lea 0x10(%esi),%esi \n"
-  "sub $0x10,%ecx \n"
-  "ja 1b \n"
-  "popa \n"
-  "ret \n"
-);
-
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
 #define HAS_SCALEFILTERROWS_SSE2
 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
@@ -2554,46 +2564,6 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
 );
 }

-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                              uint16* dst_ptr, int src_width,
-                              int src_height) {
-  asm volatile (
-  "pxor %%xmm5,%%xmm5 \n"
-"1:"
-  "movdqa (%0),%%xmm2 \n"
-  "lea (%0,%4,1),%%r10 \n"
-  "movhlps %%xmm2,%%xmm3 \n"
-  "lea -0x1(%3),%%r11 \n"
-  "punpcklbw %%xmm5,%%xmm2 \n"
-  "punpcklbw %%xmm5,%%xmm3 \n"
-
-"2:"
-  "movdqa (%%r10),%%xmm0 \n"
-  "lea (%%r10,%4,1),%%r10 \n"
-  "movhlps %%xmm0,%%xmm1 \n"
-  "punpcklbw %%xmm5,%%xmm0 \n"
-  "punpcklbw %%xmm5,%%xmm1 \n"
-  "paddusw %%xmm0,%%xmm2 \n"
-  "paddusw %%xmm1,%%xmm3 \n"
-  "sub $0x1,%%r11 \n"
-  "ja 2b \n"
-
-  "movdqa %%xmm2,(%1) \n"
-  "movdqa %%xmm3,0x10(%1) \n"
-  "lea 0x20(%1),%1 \n"
-  "lea 0x10(%0),%0 \n"
-  "sub $0x10,%2 \n"
-  "ja 1b \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(src_width),  // %2
-    "+r"(src_height)  // %3
-  : "r"(static_cast<intptr_t>(src_stride))  // %4
-  : "memory", "cc", "r10", "r11"
-);
-}
-
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
 #define HAS_SCALEFILTERROWS_SSE2
 static void ScaleFilterRows_SSE2(uint8* dst_ptr,