diff --git a/README.chromium b/README.chromium index 2c66b46e3..08dcf5119 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1465 +Version: 1466 License: BSD License File: LICENSE diff --git a/include/libyuv/compare_row.h b/include/libyuv/compare_row.h new file mode 100644 index 000000000..4562da047 --- /dev/null +++ b/include/libyuv/compare_row.h @@ -0,0 +1,77 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_COMPARE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// clang >= 3.4.0 required for AVX2. 
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) +#define CLANG_HAS_AVX2 1 +#endif // clang >= 3.4 +#endif // __clang__ + +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#define HAS_HASHDJB2_AVX2 +#endif + +// The following are available for Visual C and GCC: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) || defined(_M_IX86))) +#define HAS_HASHDJB2_SSE41 +#define HAS_SUMSQUAREERROR_SSE2 +#endif + +// The following are available for Visual C and clangcl 32 bit: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) +#define HAS_HASHDJB2_AVX2 +#define HAS_SUMSQUAREERROR_AVX2 +#endif + +// The following are available for Neon: +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SUMSQUAREERROR_NEON +#endif + +uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); +uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); +uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); +uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); + +uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); +uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); +uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ NOLINT diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7cea578c5..2285c3416 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -365,7 +365,7 @@ extern "C" { #endif #endif -#if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) +#if defined(_MSC_VER) && !defined(__CLR_VER) #define SIMD_ALIGNED(var)
__declspec(align(16)) var #define SIMD_ALIGNED32(var) __declspec(align(64)) var typedef __declspec(align(16)) int16 vec16[8]; @@ -380,7 +380,7 @@ typedef __declspec(align(32)) int8 lvec8[32]; typedef __declspec(align(32)) uint16 ulvec16[16]; typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; -#elif defined(__GNUC__) || defined(__clang__) +#elif defined(__GNUC__) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #define SIMD_ALIGNED32(var) var __attribute__((aligned(64))) diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5f2156bf3..7fec0c728 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1465 +#define LIBYUV_VERSION 1466 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/compare.cc b/source/compare.cc index e7d311138..e3846bdfd 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -17,6 +17,7 @@ #endif #include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" #include "libyuv/cpu_id.h" #include "libyuv/row.h" #include "libyuv/video_common.h" @@ -26,30 +27,13 @@ namespace libyuv { extern "C" { #endif -// hash seed of 5381 recommended. -// Internal C version of HashDjb2 with int sized count for efficiency. 
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); - -// This module is for Visual C x86 -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))) -#define HAS_HASHDJB2_SSE41 -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); - -#if defined(_M_IX86) && (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) -#define HAS_HASHDJB2_AVX2 -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); -#endif - -#endif // HAS_HASHDJB2_SSE41 - // hash seed of 5381 recommended. LIBYUV_API uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C; + uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = + HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { HashDjb2_SSE = HashDjb2_SSE41; @@ -127,23 +111,6 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count); -#if !defined(LIBYUV_DISABLE_NEON) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SUMSQUAREERROR_NEON -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); -#endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_SUMSQUAREERROR_SSE2 -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); -#endif - -#ifdef VISUALC_HAS_AVX2 -#define HAS_SUMSQUAREERROR_AVX2 -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); -#endif - // TODO(fbarchard): Refactor into row function. 
LIBYUV_API uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, diff --git a/source/compare_common.cc b/source/compare_common.cc index c546b5182..42fc58935 100644 --- a/source/compare_common.cc +++ b/source/compare_common.cc @@ -10,6 +10,8 @@ #include "libyuv/basic_types.h" +#include "libyuv/compare_row.h" + #ifdef __cplusplus namespace libyuv { extern "C" { diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index cac2fa03b..0c5cee0d5 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 5b27407ef..49aa3b4ee 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index b3d55c422..f9c7df98c 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus diff --git a/source/compare_win.cc b/source/compare_win.cc index 1043b6313..dc86fe25b 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -9,6 +9,8 @@ */ #include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -133,28 +135,28 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { movd xmm0, [esp + 12] // seed pxor xmm7, xmm7 // constant 0 for unpck - movdqa xmm6, kHash16x33 + movdqa xmm6, xmmword ptr kHash16x33 wloop: movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] pmulld xmm0, xmm6 // hash *= 33 ^ 16 - movdqa xmm5, kHashMul0 + movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 punpcklwd xmm3, xmm7 // src[0-3] pmulld xmm3, 
xmm5 - movdqa xmm5, kHashMul1 + movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 - movdqa xmm5, kHashMul2 + movdqa xmm5, xmmword ptr kHashMul2 punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 - movdqa xmm5, kHashMul3 + movdqa xmm5, xmmword ptr kHashMul3 punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 paddd xmm3, xmm4 // add 16 results @@ -181,32 +183,32 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count - movd xmm0, [esp + 12] // seed - movdqa xmm6, kHash16x33 + vmovd xmm0, [esp + 12] // seed wloop: vpmovzxbd xmm3, [eax] // src[0-3] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 + vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 vpmovzxbd xmm4, [eax + 4] // src[4-7] - pmulld xmm3, kHashMul0 + vpmulld xmm3, xmm3, xmmword ptr kHashMul0 vpmovzxbd xmm2, [eax + 8] // src[8-11] - pmulld xmm4, kHashMul1 + vpmulld xmm4, xmm4, xmmword ptr kHashMul1 vpmovzxbd xmm1, [eax + 12] // src[12-15] - pmulld xmm2, kHashMul2 + vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] - pmulld xmm1, kHashMul3 - paddd xmm3, xmm4 // add 16 results - paddd xmm1, xmm2 - paddd xmm1, xmm3 - pshufd xmm2, xmm1, 0x0e // upper 2 dwords - paddd xmm1, xmm2 - pshufd xmm2, xmm1, 0x01 - paddd xmm1, xmm2 - paddd xmm0, xmm1 + vpmulld xmm1, xmm1, xmmword ptr kHashMul3 + vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm1, xmm1, xmm3 + vpshufd xmm2, xmm1, 0x0e // upper 2 dwords + vpaddd xmm1, xmm1,xmm2 + vpshufd xmm2, xmm1, 0x01 + vpaddd xmm1, xmm1, xmm2 + vpaddd xmm0, xmm0, xmm1 sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash + vzeroupper ret } } diff --git a/source/row_any.cc b/source/row_any.cc index 2ef5359bd..9ce269a0f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -245,18 +245,6 @@ ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 
1, 1, 31) #ifdef HAS_COPYROW_NEON ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #endif -#ifdef HAS_ARGBCOPYALPHAROW_AVX2 -ANY11(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 1, 4, 15) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 1, 4, 7) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 -ANY11(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) -#endif -#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 -ANY11(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) -#endif #if defined(HAS_ARGBTORGB24ROW_SSSE3) ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) @@ -410,6 +398,36 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif #undef ANY11 +// Any 1 to 1 blended. The remainder pass copies the existing destination pixels into temp, runs ANY_SIMD on the temp copy, then copies r pixels back. +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } + +#ifdef HAS_ARGBCOPYALPHAROW_AVX2 +ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) +#endif +#ifdef HAS_ARGBCOPYALPHAROW_SSE2 +ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 +ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) +#endif +#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 +ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) +#endif +#undef ANY11B + // Any 1 to 1 with parameter.
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index cc814fd44..138ba9667 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -517,7 +517,7 @@ TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1, 0, ARGB, 4) #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ + YALIGN, W1280, DIFF, N, NEG, OFF) \ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ @@ -558,21 +558,10 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ kWidth, NEG kHeight); \ } \ int max_diff = 0; \ - /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ - align_buffer_64(dst_argb32_c, kWidth * BPP_C * kHeight); \ - align_buffer_64(dst_argb32_opt, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \ - memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \ - FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, \ - dst_argb32_c, kWidth * BPP_C , \ - kWidth, kHeight); \ - FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, \ - dst_argb32_opt, kWidth * BPP_C , \ - kWidth, kHeight); \ - for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ int abs_diff = \ - abs(static_cast<int>(dst_argb32_c[i]) - \ - static_cast<int>(dst_argb32_opt[i])); \ + abs(static_cast<int>(dst_argb_c[i + OFF]) - \ + static_cast<int>(dst_argb_opt[i + OFF])); \ if (abs_diff > max_diff) { \ max_diff = abs_diff; \ } \ @@ -584,22 +573,20 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ free_aligned_buffer_64(src_a); \ free_aligned_buffer_64(dst_argb_c); \ free_aligned_buffer_64(dst_argb_opt); \ - free_aligned_buffer_64(dst_argb32_c); \ - free_aligned_buffer_64(dst_argb32_opt); \ } #define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y,
FMT_B, BPP_B, ALIGN, \ - YALIGN, DIFF, FMT_C, BPP_C) \ + YALIGN, DIFF) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0, FMT_C, BPP_C) \ + YALIGN, benchmark_width_ - 4, DIFF, _Any, +, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1, FMT_C, BPP_C) \ + YALIGN, benchmark_width_, DIFF, _Unaligned, +, 1) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Invert, -, 0, FMT_C, BPP_C) \ + YALIGN, benchmark_width_, DIFF, _Invert, -, 0) \ TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) + YALIGN, benchmark_width_, DIFF, _Opt, +, 0) -TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2, ARGB, 4) +TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ W1280, DIFF, N, NEG, OFF) \