diff --git a/README.chromium b/README.chromium index a78bd81f9..248ac76f8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 746 +Version: 747 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ad6c50761..24f76ba8d 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -42,7 +42,11 @@ extern "C" { #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 +#define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBMULTIPLYROW_SSE2 +#define HAS_ARGBQUANTIZEROW_SSE2 +#define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 @@ -122,11 +126,7 @@ extern "C" { // Effects: #define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBCOLORMATRIXROW_SSSE3 -#define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBMIRRORROW_SSSE3 -#define HAS_ARGBQUANTIZEROW_SSE2 -#define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 3a78b51c6..a9df9bbbf 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 746 +#define LIBYUV_VERSION 747 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 642a0a71e..0efe91aed 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -21,11 +21,13 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(__native_client__) && defined(__x86_64__) -#define MEMACCESS(x) "%%nacl:(%%r15,%q" #x ")" -#define MEMLEA(x, y) #x "(%q" #y ")" +#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" +#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" +#define MEMLEA(offset, base) #offset "(%q" #base ")" #else -#define MEMACCESS(x) "(%" #x ")" -#define MEMLEA(x, y) #x "(%" #y ")" +#define MEMACCESS(base) "(%" #base ")" +#define MEMACCESS2(offset, base) #offset "(%" #base ")" +#define MEMLEA(offset, base) #offset "(%" #base ")" #endif #ifdef HAS_ARGBTOYROW_SSSE3 @@ -3925,21 +3927,21 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - "sub %0,%1 \n" // 8 pixel loop. ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm0 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm2 \n" - "movdqa 0x10(%0),%%xmm3 \n" + "movdqa "MEMACCESS(0)",%%xmm2 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm3 \n" + "lea "MEMLEA(0x20,0)",%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" @@ -3951,9 +3953,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "punpcklwd %%xmm3,%%xmm0 \n" "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "movdqa %%xmm1,0x10(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,"MEMACCESS(1)" \n" + "movdqa %%xmm1,"MEMACCESS2(0x10,1)" \n" + "lea "MEMLEA(0x20,1)",%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -3995,30 +3997,30 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // 8 pixel loop. ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa "MEMACCESS(0)",%%xmm0 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm5 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm5 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm6 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4028,9 +4030,9 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,"MEMACCESS(0)" \n" + "movdqa %%xmm1,"MEMACCESS2(0x10,0)" \n" + "lea "MEMLEA(0x20,0)",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4061,12 +4063,12 @@ void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, // 8 pixel loop. ".p2align 4 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm6 \n" + "movdqa "MEMACCESS(0)",%%xmm0 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm5 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm6,%%xmm0 \n" @@ -4076,15 +4078,15 @@ void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa (%0),%%xmm5 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm5 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddsw %%xmm1,%%xmm5 \n" "psraw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm6 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" @@ -4094,9 +4096,9 @@ void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "sub $0x8,%1 \n" - "movdqa %%xmm0,(%0) \n" - "movdqa %%xmm1,0x10(%0) \n" - "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,"MEMACCESS(0)" \n" + "movdqa %%xmm1,"MEMACCESS2(0x10,0)" \n" + "lea "MEMLEA(0x20,0)",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -4131,7 +4133,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, // 4 pixel loop. ".p2align 2 \n" "1: \n" - "movdqa (%0),%%xmm0 \n" + "movdqa "MEMACCESS(0)",%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" "movdqa (%0),%%xmm1 \n" @@ -4146,8 +4148,8 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "sub $0x4,%1 \n" - "movdqa %%xmm0,(%0) \n" - "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,"MEMACCESS(0)" \n" + "lea "MEMLEA(0x10,0)",%0 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 diff --git a/source/row_win.cc b/source/row_win.cc index 25c07b3f8..f447ecbc5 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4922,7 +4922,6 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov ecx, [esp + 12] /* width */ movdqa xmm4, kARGBToYJ movdqa xmm5, kAddYJ64 - sub edx, eax align 16 convertloop: @@ -4936,6 +4935,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { packuswb xmm0, xmm0 // 8 G bytes movdqa xmm2, [eax] // A movdqa xmm3, [eax + 16] + lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 @@ -4947,9 +4947,9 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { punpcklwd xmm0, xmm3 // GGGA first 4 punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 - lea eax, [eax + 32] + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] jg convertloop ret }