diff --git a/README.chromium b/README.chromium index ebdbce09b..4dd7bce77 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 779 +Version: 780 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index e898381c5..9bafdb42f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -142,8 +142,8 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Effects: #define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_RGBCOLORTABLEROW_X86 #define HAS_ARGBPOLYNOMIALROW_SSE2 +#define HAS_RGBCOLORTABLEROW_X86 // Caveat: Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 @@ -153,6 +153,7 @@ extern "C" { #define HAS_ARGBTOYROW_AVX2 #define HAS_HALFROW_AVX2 #define HAS_I422TOARGBROW_AVX2 +#define HAS_INTERPOLATEROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_SPLITUVROW_AVX2 @@ -162,17 +163,17 @@ extern "C" { #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 -#define HAS_INTERPOLATEROW_AVX2 // Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 +#define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 -#endif -#endif +#endif // _MSC_VER >= 1700 +#endif // defined(_MSC_VER) // The following are Yasm x86 only: // TODO(fbarchard): Port AVX2 to inline. @@ -1549,10 +1550,12 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void ARGBPolynomialRow_C(const uint8* src_argb, uint8* dst_argb, const float* poly, int width); - void ARGBPolynomialRow_SSE2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width); +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width); // Divide num by div and return as 16.16 fixed point result. 
int FixedDiv_C(int num, int div); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b0ce4062e..c616579c3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 779 +#define LIBYUV_VERSION 780 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index da5f0d2df..823dc2ca5 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2055,6 +2055,11 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasSSE2)) { ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } +#endif +#if defined(HAS_ARGBPOLYNOMIALROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 2)) { + ARGBPolynomialRow = ARGBPolynomialRow_AVX2; + } #endif for (int y = 0; y < height; ++y) { ARGBPolynomialRow(src_argb, dst_argb, poly, width); diff --git a/source/row_common.cc b/source/row_common.cc index 03d0e5bc0..07128a73b 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2042,10 +2042,10 @@ void ARGBPolynomialRow_C(const uint8* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0]= Clamp(static_cast(db)); - dst_argb[1]= Clamp(static_cast(dg)); - dst_argb[2]= Clamp(static_cast(dr)); - dst_argb[3]= Clamp(static_cast(da)); + dst_argb[0] = Clamp(static_cast(db)); + dst_argb[1] = Clamp(static_cast(dg)); + dst_argb[2] = Clamp(static_cast(dr)); + dst_argb[3] = Clamp(static_cast(da)); src_argb += 4; dst_argb += 4; } diff --git a/source/row_win.cc b/source/row_win.cc index af87a3d1f..db09f95a7 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6783,7 +6783,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 16] /* width */ - pxor xmm3, xmm3 // 4 bytes to 4 ints + pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 
align 16 convertloop: @@ -6814,6 +6814,66 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 +#ifdef HAS_ARGBPOLYNOMIALROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBPolynomialRow_AVX2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { // 2 pixels per loop; poly = C0..C3, 4 floats (b, g, r, a) each. + __asm { + mov eax, [esp + 12] /* poly */ + vmovdqu xmm4, [eax] // C0 + vmovdqu xmm5, [eax + 16] // C1 + vmovdqu xmm6, [eax + 32] // C2 + vmovdqu xmm7, [eax + 48] // C3 + vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords (one copy per pixel) + vpermq ymm5, ymm5, 0x44 + vpermq ymm6, ymm6, 0x44 + vpermq ymm7, ymm7, 0x44 + + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 16] /* width */ + vpxor ymm3, ymm3, ymm3 // 0 constant for zero extending and packing. + + align 16 + convertloop: + vmovq xmm0, qword ptr [eax] // 2 BGRA pixels + lea eax, [eax + 8] + +// Zero extend 8 bytes to 8 ints; same result as vpmovzxbd ymm0, xmm0. +// TODO(fbarchard): Consider vex256 to avoid vpermq. + vpunpcklbw xmm0, xmm0, xmm3 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpunpcklwd ymm0, ymm0, ymm3 // b000g000_r000a000_b000g000_r000a000 + + vcvtdq2ps ymm0, ymm0 // 8 floats + vmovdqa ymm1, ymm0 // X + vmulps ymm0, ymm0, ymm5 // C1 * X + vaddps ymm0, ymm0, ymm4 // result = C0 + C1 * X + vmovdqa ymm2, ymm1 // X + vmulps ymm2, ymm2, ymm1 // X * X + vmulps ymm1, ymm1, ymm2 // X * X * X + vmulps ymm2, ymm2, ymm6 // C2 * X * X + vmulps ymm1, ymm1, ymm7 // C3 * X * X * X + vaddps ymm0, ymm0, ymm2 // result += C2 * X * X + vaddps ymm0, ymm0, ymm1 // result += C3 * X * X * X + vcvttps2dq ymm0, ymm0 // truncate 8 floats to 8 ints + +// 8 ints to 8 bytes. Signed pack first so ints above 32767 clamp to 255, not 0. + vpackssdw ymm0, ymm0, ymm3 // b0g0r0a0_00000000_b0g0r0a0_00000000 + vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 + vpackuswb xmm0, xmm0, xmm3 // bgrabgra_00000000 (unsigned saturate to 0..255) + + sub ecx, 2 + vmovq qword ptr [edx], xmm0 // store 2 pixels; vmovq and lea keep flags from sub + lea edx, [edx + 8] + jg convertloop + vzeroupper // zero upper ymm halves before returning + ret + } +} +#endif 
// HAS_ARGBPOLYNOMIALROW_AVX2 + #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #ifdef __cplusplus diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 8683592fe..6baecefe0 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1665,7 +1665,7 @@ TEST_F(libyuvTest, TestARGBPolynomial) { 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x - }; + }; // Test blue orig_pixels[0][0] = 255u; diff --git a/util/psnr.h b/util/psnr.h index 2cd0b1457..370337a75 100644 --- a/util/psnr.h +++ b/util/psnr.h @@ -10,7 +10,7 @@ // Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_PSNR_H_ +#ifndef UTIL_PSNR_H_ // NOLINT #define UTIL_PSNR_H_ #ifdef __cplusplus @@ -36,4 +36,4 @@ double ComputeSumSquareError(const uint8* org, const uint8* rec, int size); } // extern "C" #endif -#endif // UTIL_PSNR_H_ +#endif // UTIL_PSNR_H_ // NOLINT diff --git a/util/ssim.cc b/util/ssim.cc index 277561dd0..d07889a8a 100644 --- a/util/ssim.cc +++ b/util/ssim.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "./ssim.h" +#include "../util/ssim.h" // NOLINT #include #include diff --git a/util/ssim.h b/util/ssim.h index 0689276ad..40120b4f4 100644 --- a/util/ssim.h +++ b/util/ssim.h @@ -10,7 +10,7 @@ // Get SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format -#ifndef UTIL_SSIM_H_ +#ifndef UTIL_SSIM_H_ // NOLINT #define UTIL_SSIM_H_ #ifdef __cplusplus @@ -32,4 +32,4 @@ double CalcLSSIM(double ssim); } // extern "C" #endif -#endif // UTIL_SSIM_H_ +#endif // UTIL_SSIM_H_ // NOLINT