diff --git a/Makefile b/Makefile index 8d294a7be..b3efc1806 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # This is a generic makefile for libyuv for gcc. -# Caveat: This file will get overwritten by GYP if projects are generated +# Caveat: This file will get overwritten by GYP if projects are generated # with GYP_GENERATORS=make CC=g++ @@ -34,15 +34,15 @@ LOCAL_OBJ_FILES := \ .cc.o: $(CC) -c $(CCFLAGS) $*.cc -o $*.o -all: libyuv.a convert +all: libyuv.a convert Makefile -libyuv.a: $(LOCAL_OBJ_FILES) +libyuv.a: $(LOCAL_OBJ_FILES) Makefile $(AR) $(ARFLAGS) -o $@ $(LOCAL_OBJ_FILES) # A test utility that uses libyuv conversion. -convert: util/convert.cc +convert: util/convert.cc libyuv.a Makefile $(CC) $(CCFLAGS) -Iutil/ -o $@ util/convert.cc libyuv.a clean: - /bin/rm -f *.o libyuv.a convert + /bin/rm -f source/*.o libyuv.a convert diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 874e19566..32fac0043 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -145,22 +145,26 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #endif -// The following are available on all x86 platforms, including NaCL, but -// require VS2012, clang, gcc4.7 or NaCL. -// Caveat: llvm 3.1 required, but does not provide a version. +// AVX2 functions available on all x86 platforms, but not NaCL, and +// require VS2012, clang 3.4 or gcc 4.7. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) /* Test for GCC >= 4.7.0 */ #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) #define GCC_HAS_AVX2 1 #endif // GNUC >= 4.7 #endif // __GNUC__ -// TODO(fbarchard): Test with new NaCL tool chain. Change __native_client__AVX2 -// to __native_client__ to test.
+ +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +/* Test for clang >= 3.4.0 */ +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + #if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) && \ ((defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700) || \ - ((defined(__x86_64__) || defined(__i386__)) && \ - (defined(__native_client__AVX2) || defined(__clang__) || \ - defined(GCC_HAS_AVX2)))) + defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) // Effects: #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 diff --git a/source/scale.cc b/source/scale.cc index c148032d9..2f039e148 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1689,12 +1689,12 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, asm volatile ( "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" - "movl $0x04040000,%k5 \n" - "movd %k5,%%xmm5 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%2 \n" + "subl $0x2,%5 \n" "jl 29f \n" "movdqa %%xmm2,%%xmm0 \n" "paddd %%xmm3,%%xmm0 \n" @@ -1706,11 +1706,11 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "2: \n" "movdqa %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm2 \n" - "movzwl (%1,%3,1),%k5 \n" - "movd %k5,%%xmm0 \n" + "movzwl (%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm1 \n" - "movzwl (%1,%4,1),%k5 \n" - "movd %k5,%%xmm4 \n" + "movzwl (%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" "pshufb %%xmm5,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm0 \n" "pxor %%xmm6,%%xmm1 \n" @@ -1719,32 +1719,32 @@ static void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, "pextrw $0x3,%%xmm2,%k4 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "mov %w5,(%0) \n" + "movd %%xmm0,%k2 \n" + "mov %w2,(%0) \n" "lea 0x2(%0),%0 \n" - "sub $0x2,%2 \n" +
"sub $0x2,%5 \n" "jge 2b \n" ".p2align 2 \n" "29: \n" - "addl $0x1,%2 \n" + "addl $0x1,%5 \n" "jl 99f \n" - "movzwl (%1,%3,1),%k5 \n" - "movd %k5,%%xmm0 \n" + "movzwl (%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm1 \n" "pshufb %%xmm5,%%xmm1 \n" "pxor %%xmm6,%%xmm1 \n" "pmaddubsw %%xmm1,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "mov %b5,(%0) \n" + "movd %%xmm0,%k2 \n" + "mov %b2,(%0) \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+a"(x0), // %3 - "+d"(x1), // %4 - "+b"(temp_pixel) // %5 + "+a"(temp_pixel), // %2 + "+r"(x0), // %3 + "+r"(x1), // %4 + "+rm"(dst_width) // %5 : "rm"(x), // %6 "rm"(dx) // %7 : "memory", "cc"