diff --git a/README.chromium b/README.chromium
index 6f8b66b50..6280fad0d 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 199
+Version: 200
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 2510546a3..dcf55aab9 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 199
+#define LIBYUV_VERSION 200
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/rotate.cc b/source/rotate.cc
index ef399924d..4d186c06e 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -298,87 +298,87 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
   asm volatile (
-  // Read in the data from the source pointer.
-  // First round of bit swap.
-"1: \n"
-  "movq (%0),%%xmm0 \n"
-  "movq (%0,%3),%%xmm1 \n"
-  "lea (%0,%3,2),%0 \n"
-  "punpcklbw %%xmm1,%%xmm0 \n"
-  "movq (%0),%%xmm2 \n"
-  "movdqa %%xmm0,%%xmm1 \n"
-  "palignr $0x8,%%xmm1,%%xmm1 \n"
-  "movq (%0,%3),%%xmm3 \n"
-  "lea (%0,%3,2),%0 \n"
-  "punpcklbw %%xmm3,%%xmm2 \n"
-  "movdqa %%xmm2,%%xmm3 \n"
-  "movq (%0),%%xmm4 \n"
-  "palignr $0x8,%%xmm3,%%xmm3 \n"
-  "movq (%0,%3),%%xmm5 \n"
-  "lea (%0,%3,2),%0 \n"
-  "punpcklbw %%xmm5,%%xmm4 \n"
-  "movdqa %%xmm4,%%xmm5 \n"
-  "movq (%0),%%xmm6 \n"
-  "palignr $0x8,%%xmm5,%%xmm5 \n"
-  "movq (%0,%3),%%xmm7 \n"
-  "lea (%0,%3,2),%0 \n"
-  "punpcklbw %%xmm7,%%xmm6 \n"
-  "neg %3 \n"
-  "movdqa %%xmm6,%%xmm7 \n"
-  "lea 0x8(%0,%3,8),%0 \n"
-  "palignr $0x8,%%xmm7,%%xmm7 \n"
-  "neg %3 \n"
-  // Second round of bit swap.
-  "punpcklwd %%xmm2,%%xmm0 \n"
-  "punpcklwd %%xmm3,%%xmm1 \n"
-  "movdqa %%xmm0,%%xmm2 \n"
-  "movdqa %%xmm1,%%xmm3 \n"
-  "palignr $0x8,%%xmm2,%%xmm2 \n"
-  "palignr $0x8,%%xmm3,%%xmm3 \n"
-  "punpcklwd %%xmm6,%%xmm4 \n"
-  "punpcklwd %%xmm7,%%xmm5 \n"
-  "movdqa %%xmm4,%%xmm6 \n"
-  "movdqa %%xmm5,%%xmm7 \n"
-  "palignr $0x8,%%xmm6,%%xmm6 \n"
-  "palignr $0x8,%%xmm7,%%xmm7 \n"
-  // Third round of bit swap.
-  // Write to the destination pointer.
-  "punpckldq %%xmm4,%%xmm0 \n"
-  "movq %%xmm0,(%1) \n"
-  "movdqa %%xmm0,%%xmm4 \n"
-  "palignr $0x8,%%xmm4,%%xmm4 \n"
-  "movq %%xmm4,(%1,%4) \n"
-  "lea (%1,%4,2),%1 \n"
-  "punpckldq %%xmm6,%%xmm2 \n"
-  "movdqa %%xmm2,%%xmm6 \n"
-  "movq %%xmm2,(%1) \n"
-  "palignr $0x8,%%xmm6,%%xmm6 \n"
-  "punpckldq %%xmm5,%%xmm1 \n"
-  "movq %%xmm6,(%1,%4) \n"
-  "lea (%1,%4,2),%1 \n"
-  "movdqa %%xmm1,%%xmm5 \n"
-  "movq %%xmm1,(%1) \n"
-  "palignr $0x8,%%xmm5,%%xmm5 \n"
-  "movq %%xmm5,(%1,%4) \n"
-  "lea (%1,%4,2),%1 \n"
-  "punpckldq %%xmm7,%%xmm3 \n"
-  "movq %%xmm3,(%1) \n"
-  "movdqa %%xmm3,%%xmm7 \n"
-  "palignr $0x8,%%xmm7,%%xmm7 \n"
-  "sub $0x8,%2 \n"
-  "movq %%xmm7,(%1,%4) \n"
-  "lea (%1,%4,2),%1 \n"
-  "ja 1b \n"
-  : "+r"(src),    // %0
-    "+r"(dst),    // %1
-    "+r"(width)   // %2
-  : "r"(static_cast<intptr_t>(src_stride)),  // %3
-    "r"(static_cast<intptr_t>(dst_stride))   // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-);
+    // Read in the data from the source pointer.
+    // First round of bit swap.
+ "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(static_cast(src_stride)), // %3 + "r"(static_cast(dst_stride)) // %4 + : "memory", "cc" + #if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + #endif + ); } #if defined (__i386__) @@ -755,6 +755,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, #endif #endif + static void TransposeWx8_C(const uint8* src, int src_stride, uint8* dst, int dst_stride, int w) { @@ -1007,28 +1008,28 @@ void RotateUV270(const uint8* src, int src_stride, #if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_UV_SSSE3 __declspec(naked) -void MirrorRowUV_SSSE3(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { -__asm { +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b, + int width) { + __asm { push edi mov eax, [esp + 4 + 4] // src mov edx, [esp + 4 + 8] // dst_a mov edi, [esp + 4 + 12] // dst_b mov ecx, [esp + 4 + 16] // width - movdqa xmm5, kShuffleMirrorUV + movdqa xmm1, kShuffleMirrorUV lea eax, [eax + ecx * 2 - 16] + sub edi, edx convertloop: movdqa xmm0, [eax] lea eax, [eax - 16] - pshufb xmm0, xmm5 + pshufb xmm0, xmm1 sub ecx, 8 movlpd qword ptr [edx], xmm0 + movhpd qword ptr [edx + edi], xmm0 lea edx, [edx + 8] - movhpd qword ptr [edi], xmm0 - lea edi, [edi + 8] ja convertloop + pop edi ret } @@ -1037,22 +1038,21 @@ __asm { #elif (defined(__i386__) || defined(__x86_64__)) && \ !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_UV_SSSE3 -void MirrorRowUV_SSSE3(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { +void MirrorRowUV_SSSE3(const uint8* src, 
+                       int width) {
   intptr_t temp_width = static_cast<intptr_t>(width);
   asm volatile (
-    "movdqa %4,%%xmm5 \n"
+    "movdqa %4,%%xmm1 \n"
     "lea -16(%0,%3,2),%0 \n"
+    "sub %1,%2 \n"
     "1: \n"
     "movdqa (%0),%%xmm0 \n"
     "lea -16(%0),%0 \n"
-    "pshufb %%xmm5,%%xmm0 \n"
+    "pshufb %%xmm1,%%xmm0 \n"
     "sub $8,%3 \n"
     "movlpd %%xmm0,(%1) \n"
+    "movhpd %%xmm0,(%1,%2) \n"
     "lea 8(%1),%1 \n"
-    "movhpd %%xmm0,(%2) \n"
-    "lea 8(%2),%2 \n"
     "ja 1b \n"
   : "+r"(src),      // %0
     "+r"(dst_a),    // %1
@@ -1061,7 +1061,7 @@ void MirrorRowUV_SSSE3(const uint8* src,
   : "m"(kShuffleMirrorUV)  // %4
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm5"
+    , "xmm0", "xmm1"
 #endif
 );
 }
@@ -1070,12 +1070,11 @@ void MirrorRowUV_SSSE3(const uint8* src,
 
 static void MirrorRowUV_C(const uint8* src,
                           uint8* dst_a, uint8* dst_b,
                           int width) {
-  int i;
-  src += width << 1;
-  for (i = 0; i < width; ++i) {
-    src -= 2;
+  src += (width << 1) - 2;
+  for (int i = 0; i < width; ++i) {
     dst_a[i] = src[0];
     dst_b[i] = src[1];
+    src -= 2;
   }
 }
@@ -1083,7 +1082,6 @@ void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
                  int width, int height) {
-  int i;
   mirror_uv_func MirrorRow;
 
 #if defined(HAS_MIRRORROW_UV_NEON)
@@ -1105,12 +1103,11 @@ void RotateUV180(const uint8* src, int src_stride,
   dst_a += dst_stride_a * (height - 1);
   dst_b += dst_stride_b * (height - 1);
 
-  for (i = 0; i < height; ++i) {
+  for (int i = 0; i < height; ++i) {
     MirrorRow(src, dst_a, dst_b, width);
-
-    src += src_stride;      // down one line at a time
-    dst_a -= dst_stride_a;  // nominally up one line at a time
-    dst_b -= dst_stride_b;  // nominally up one line at a time
+    src += src_stride;
+    dst_a -= dst_stride_a;
+    dst_b -= dst_stride_b;
   }
 }
 
diff --git a/source/scale.cc b/source/scale.cc
index 018fcd13b..ab4735790 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1701,15 +1701,15 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   intptr_t tmp_src = 0;
   asm volatile (
     "pxor %%xmm4,%%xmm4 \n"
-    "sub $0x1,%3 \n"
+    "sub $0x1,%5 \n"
     "1: \n"
     "movdqa (%0),%%xmm0 \n"
-    "mov %0,%5 \n"
+    "mov %0,%3 \n"
     "add %6,%0 \n"
     "movdqa %%xmm0,%%xmm1 \n"
     "punpcklbw %%xmm4,%%xmm0 \n"
     "punpckhbw %%xmm4,%%xmm1 \n"
-    "mov %3,%4 \n"
+    "mov %5,%2 \n"
     "2: \n"
     "movdqa (%0),%%xmm2 \n"
     "add %6,%0 \n"
@@ -1718,21 +1718,21 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "punpckhbw %%xmm4,%%xmm3 \n"
     "paddusw %%xmm2,%%xmm0 \n"
     "paddusw %%xmm3,%%xmm1 \n"
-    "sub $0x1,%4 \n"
+    "sub $0x1,%2 \n"
     "ja 2b \n"
     "movdqa %%xmm0,(%1) \n"
     "movdqa %%xmm1,0x10(%1) \n"
-    "lea 0x10(%5),%0 \n"
+    "lea 0x10(%3),%0 \n"
     "lea 0x20(%1),%1 \n"
-    "sub $0x10,%2 \n"
+    "sub $0x10,%4 \n"
     "ja 1b \n"
   : "+r"(src_ptr),     // %0
     "+r"(dst_ptr),     // %1
-    "+rm"(src_width),  // %2
-    "+rm"(src_height), // %3
-    "+r"(tmp_height),  // %4
-    "+r"(tmp_src)      // %5
-  : "rm"(static_cast<intptr_t>(src_stride))  // %6
+    "+r"(tmp_height),  // %2
+    "+r"(tmp_src),     // %3
+    "+rm"(src_width),  // %4
+    "+rm"(src_height)  // %5
+  : "r"(static_cast<intptr_t>(src_stride))   // %6
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
@@ -1740,6 +1740,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   );
 }
 
+
 #if defined(__i386__)
 extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width);
@@ -2886,7 +2887,6 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
 
 // (1-f)a + fb can be replaced with a + f(b-a)
 #define BLENDER(a, b, f) ((int)(a) + ((f) * ((int)(b) - (int)(a)) >> 16))
-// TODO(fbarchard): consider +0x8000 for rounding if it can be done for free.
 
 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                               int dst_width, int x, int dx) {
   for (int j = 0; j < dst_width - 1; j += 2) {
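
Notes (not part of the patch):

The MirrorRowUV_SSSE3 changes in rotate.cc replace the second destination
pointer with a byte offset: "sub edi, edx" in the MSVC path and "sub %1,%2"
in the GCC path compute dst_b - dst_a once, so the loop advances only dst_a
and stores the V half at dst_a plus that offset ("movhpd [edx + edi]" /
"movhpd %%xmm0,(%1,%2)"). A minimal scalar sketch of the idea follows; the
function name is made up for illustration and is not libyuv API, and it
assumes, as the assembly does, that dst_b can be addressed as a fixed byte
offset from dst_a.

  #include <stddef.h>
  #include <stdint.h>

  // Illustrative only: scalar equivalent of the pointer-offset trick.
  static void MirrorRowUV_offset_sketch(const uint8_t* src,
                                        uint8_t* dst_a, uint8_t* dst_b,
                                        int width) {
    ptrdiff_t b_offset = dst_b - dst_a;  // "sub edi, edx" / "sub %1,%2"
    src += (width << 1) - 2;             // start at the last UV pair
    for (int i = 0; i < width; ++i) {
      dst_a[i] = src[0];                 // U plane ("movlpd [edx]")
      dst_a[i + b_offset] = src[1];      // V plane ("movhpd [edx + edi]")
      src -= 2;
    }
  }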
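The BLENDER macro in scale.cc is fixed-point linear interpolation: with f a
16-bit fraction, a + ((f * (b - a)) >> 16) equals (1-f)a + fb with the
fractional part truncated. The TODO comment removed by the last hunk
suggested adding +0x8000 (half of 1 << 16) before the shift to round instead
of truncate. A small sketch of both forms, for illustration only; the rounded
variant is just the TODO's suggestion, not something this patch implements.

  // Truncating blend, as BLENDER computes it.
  static inline int BlendTruncate(int a, int b, int f) {
    return a + ((f * (b - a)) >> 16);
  }

  // Rounded blend: add half of the 1 << 16 divisor before shifting.
  static inline int BlendRound(int a, int b, int f) {
    return a + ((f * (b - a) + 0x8000) >> 16);
  }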