diff --git a/README.chromium b/README.chromium index 82e8db96e..fc0a93c32 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 802 +Version: 803 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index dc2db70f4..d161e3db4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 802 +#define LIBYUV_VERSION 803 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index e3fa64a96..ff2718853 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -5880,14 +5880,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { asm volatile ( - "vmovdqu "MEMACCESS(3)",%%xmm4 \n" - "vmovdqu "MEMACCESS2(0x10,3)",%%xmm5 \n" - "vmovdqu "MEMACCESS2(0x20,3)",%%xmm6 \n" - "vmovdqu "MEMACCESS2(0x30,3)",%%xmm7 \n" - "vpermq $0x44,%%ymm4,%%ymm4 \n" - "vpermq $0x44,%%ymm5,%%ymm5 \n" - "vpermq $0x44,%%ymm6,%%ymm6 \n" - "vpermq $0x44,%%ymm7,%%ymm7 \n" + "vbroadcastf128 "MEMACCESS(3)",%%ymm4 \n" + "vbroadcastf128 "MEMACCESS2(0x10,3)",%%ymm5 \n" + "vbroadcastf128 "MEMACCESS2(0x20,3)",%%ymm6 \n" + "vbroadcastf128 "MEMACCESS2(0x30,3)",%%ymm7 \n" // 2 pixel loop. ".p2align 4 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 57d8de5c6..316f29df6 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -30,16 +30,6 @@ static const vec8 kARGBToYJ = { 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; -static const lvec8 kARGBToY_AVX = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; - -static const lvec8 kARGBToYJ_AVX = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; - static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; @@ -48,12 +38,6 @@ static const vec8 kARGBToUJ = { 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 }; -// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version. -static const lvec8 kARGBToU_AVX = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; - static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; @@ -62,13 +46,8 @@ static const vec8 kARGBToVJ = { -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 }; -static const lvec8 kARGBToV_AVX = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 -}; - // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kShufARGBToY_AVX = { +static const lvec32 kPermdARGBToY_AVX = { 0, 4, 1, 5, 2, 6, 3, 7 }; @@ -124,16 +103,6 @@ static const uvec8 kAddY16 = { static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; -static const lvec16 kAddYJ64_AVX = { - 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 -}; - -static const ulvec8 kAddY16_AVX = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -144,13 +113,6 @@ static const uvec16 kAddUVJ128 = { 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u }; -static const ulvec8 kAddUV128_AVX = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; - // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u @@ -737,9 +699,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm6, kShufARGBToY_AVX - vmovdqa ymm5, kAddY16_AVX - vmovdqa ymm4, kARGBToY_AVX + vbroadcastf128 ymm4, kARGBToY + vbroadcastf128 ymm5, kAddY16 + vmovdqa ymm6, kPermdARGBToY_AVX align 16 convertloop: @@ -777,9 +739,9 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - vmovdqa ymm4, kARGBToYJ_AVX - vmovdqa ymm5, kAddYJ64_AVX - vmovdqa ymm6, kShufARGBToY_AVX + vbroadcastf128 ymm4, kARGBToYJ + vbroadcastf128 ymm5, kAddYJ64 + vmovdqa ymm6, kPermdARGBToY_AVX align 16 convertloop: @@ -1229,9 +1191,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // pix - vmovdqa ymm7, kARGBToU_AVX - vmovdqa ymm6, kARGBToV_AVX - vmovdqa ymm5, kAddUV128_AVX + vbroadcastf128 ymm5, kAddUV128 + vbroadcastf128 ymm6, kARGBToV + vbroadcastf128 ymm7, kARGBToU sub edi, edx // stride from u to v align 16 @@ -6640,8 +6602,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_bayer mov ecx, [esp + 12] // shuffler - vmovdqa xmm5, [ecx] - vpermq ymm5, ymm5, 0x44 // same shuffle in high as low. + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. mov ecx, [esp + 16] // pix align 16 @@ -6825,18 +6786,13 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { __asm { - mov eax, [esp + 12] /* poly */ - vmovdqu xmm4, [eax] // C0 - vmovdqu xmm5, [eax + 16] // C1 - vmovdqu xmm6, [eax + 32] // C2 - vmovdqu xmm7, [eax + 48] // C3 - vpermq ymm4, ymm4, 0x44 // dup low qwords to high qwords - vpermq ymm5, ymm5, 0x44 - vpermq ymm6, ymm6, 0x44 - vpermq ymm7, ymm7, 0x44 - mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 + vbroadcastf128 ymm5, [ecx + 16] // C1 + vbroadcastf128 ymm6, [ecx + 32] // C2 + vbroadcastf128 ymm7, [ecx + 48] // C3 mov ecx, [esp + 16] /* width */ // 2 pixel loop.