diff --git a/README.chromium b/README.chromium
index 5c5bb7a1e..541fce33a 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1162
+Version: 1163
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 9435ddf76..941be842c 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -205,6 +205,7 @@ extern "C" {
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBMULTIPLYROW_AVX2
 #define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
 #endif
 
 // The following require VS2012.
@@ -218,10 +219,7 @@ extern "C" {
 #define HAS_I422TOABGRROW_AVX2
 #define HAS_INTERPOLATEROW_AVX2
 #define HAS_MIRRORROW_AVX2
-
-// Effects:
 #define HAS_ARGBMIRRORROW_AVX2
-#define HAS_ARGBUNATTENUATEROW_AVX2
 #endif  // defined(VISUALC_HAS_AVX2)
 
 // The following are Yasm x86 only:
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 08e24ed45..921eb9d5e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1162
+#define LIBYUV_VERSION 1163
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 2f272bc61..77047a28c 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -3379,7 +3379,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
     "jge       40b                             \n"
     "jmp       49f                             \n"
 
-    // 4 pixel unaligned loop.
+    // 4 pixel loop.
     LABELALIGN
   "41:                                         \n"
     "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
@@ -3449,7 +3449,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm4,%%xmm4                   \n"
@@ -3497,14 +3496,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha
 static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
 };
 static uvec8 kShuffleAlpha1 = {
   11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
 };
 // Attenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm3,%%xmm3                   \n"
@@ -3551,9 +3549,8 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const ulvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+static const uvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
 };
 // Attenuate 8 pixels at a time.
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3597,7 +3594,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-// aligned to 16 bytes
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
   uintptr_t alpha = 0;
@@ -3647,6 +3643,80 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
 
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
+};
+// Unattenuate 8 pixels at a time.
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  uintptr_t alpha = 0;
+  asm volatile (
+    "sub        %0,%1                          \n"
+    "vbroadcastf128 %5,%%ymm5                  \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    // replace VPGATHER
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //  vmovd 0x0(%4,%3,4),%%xmm3
+    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //  vmovd 0x0(%4,%3,4),%%xmm0
+    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //  vmovd 0x0(%4,%3,4),%%xmm1
+    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
+    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //  vmovd 0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
+    MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //  vmovd 0x0(%4,%3,4),%%xmm3
+    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+    // end of VPGATHER
+
+    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
+    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+    "sub        $0x8,%2                        \n"
+    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(alpha)        // %3
+  : "r"(fixed_invtbl8),  // %4
+    "m"(kUnattenShuffleAlpha_AVX2)  // %5
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3841,7 +3911,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-// aligned to 16 bytes
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
   asm volatile (
@@ -3894,7 +3963,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
   asm volatile (
diff --git a/source/row_win.cc b/source/row_win.cc
index 6c3e43ef5..d07dc620e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -72,7 +72,6 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
 
 // 64 bit
 #if defined(_M_X64)
-// Aligned destination version.
 __declspec(align(16))
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
@@ -165,7 +164,7 @@ static const lvec32 kPermdARGBToY_AVX = {
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
   0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
 };
 
 // Constants for BGRA.
@@ -1845,7 +1844,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf,
     __asm packuswb xmm2, xmm2          /* R */ \
   }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
@@ -1888,7 +1887,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
@@ -1935,7 +1934,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToRAWRow_SSSE3(const uint8* y_buf,
@@ -2055,7 +2054,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void I422ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2098,7 +2097,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 // Similar to I420 but duplicate UV once more.
 __declspec(naked) __declspec(align(16))
@@ -2144,7 +2143,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2182,7 +2181,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-// 8 pixels, dest aligned 16.
+// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
@@ -2423,8 +2422,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
 
 #ifdef HAS_MIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec8 kShuffleMirror_AVX2 = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
+static const uvec8 kShuffleMirror_AVX2 = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
@@ -2434,7 +2432,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
     mov       eax, [esp + 4]   // src
     mov       edx, [esp + 8]   // dst
     mov       ecx, [esp + 12]  // width
-    vmovdqa   ymm5, kShuffleMirror_AVX2
+    vbroadcastf128 ymm5, kShuffleMirror_AVX2
    lea       eax, [eax - 32]
 
     align      4
@@ -3711,7 +3709,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
-// Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
@@ -3805,8 +3802,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
-  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
 };
 __declspec(naked) __declspec(align(16))
 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
@@ -3846,7 +3842,6 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-// Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                              int width) {
@@ -3896,7 +3891,7 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
+static const uvec8 kUnattenShuffleAlpha_AVX2 = {
   0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
 };
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
@@ -4185,7 +4180,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-// Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                           int interval_offset, int width) {
@@ -4232,7 +4226,6 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value) {
@@ -4738,8 +4731,7 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 // area is the number of pixels in the area being averaged.
 // dst points to pixel to store result to.
 // count is number of averaged pixels to produce.
-// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
-// aligned.
+// Does 4 pixels at a time.
 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                     int width, int area, uint8* dst,
                                     int count) {
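
Notes on the change, for reference.

Unattenuation (unpremultiply) restores color channels that were premultiplied by alpha: dst = src * 255 / alpha per channel, clamped to 255, with alpha passed through. The new ARGBUnattenuateRow_AVX2 kernel avoids a per-pixel division by looking up a fixed-point reciprocal of alpha (the fixed_invtbl8 operand, %4) and multiplying with vpmulhuw. Below is a minimal scalar sketch of that scheme; the 16.16 table encoding and the names kInvTbl and ARGBUnattenuateRow_C_Sketch are illustrative assumptions, not fixed_invtbl8's exact layout.

#include <stdint.h>

// Hypothetical reciprocal table standing in for libyuv's fixed_invtbl8:
// kInvTbl[a] ~= (255 << 16) / a in 16.16 fixed point, kInvTbl[0] = 0.
static uint32_t kInvTbl[256];

static void InitInvTbl(void) {
  kInvTbl[0] = 0;
  for (uint32_t a = 1; a < 256; ++a) {
    kInvTbl[a] = (255u << 16) / a;
  }
}

static uint8_t Clamp255(uint32_t v) {
  return (uint8_t)(v > 255u ? 255u : v);
}

// Scalar equivalent of one row of ARGB unattenuation. Bytes are B,G,R,A in
// memory. Attenuation is lossy, so the multiply can overshoot 255; clamp.
static void ARGBUnattenuateRow_C_Sketch(const uint8_t* src_argb,
                                        uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t inv = kInvTbl[src_argb[3]];
    dst_argb[0] = Clamp255((src_argb[0] * inv) >> 16);
    dst_argb[1] = Clamp255((src_argb[1] * inv) >> 16);
    dst_argb[2] = Clamp255((src_argb[2] * inv) >> 16);
    dst_argb[3] = src_argb[3];
    src_argb += 4;
    dst_argb += 4;
  }
}

The AVX2 kernel performs the same multiply in parallel: vpunpcklbw/vpunpckhbw widen the pixel bytes to words, vpshufb with kUnattenShuffleAlpha_AVX2 replicates each pixel's reciprocal across its color words, and vpmulhuw takes the high 16 bits of the product.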
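The block bracketed by "replace VPGATHER" / "end of VPGATHER" emulates an 8-lane table gather: it loads each pixel's alpha byte with a scalar movzb, fetches the matching 32-bit table entry with vmovd, and stitches the eight entries together with vpunpckldq/vpunpcklqdq/vinserti128. The TODO(fbarchard) note about USE_GATHER in row_win.cc suggests hardware gather was not a win on then-current AVX2 parts. For comparison, a sketch of the same lookup written with the real AVX2 gather intrinsic, assuming a 32-bit-per-entry reciprocal table (inv_tbl and GatherAlphaReciprocals are stand-in names):

#include <immintrin.h>
#include <stdint.h>

// Gather one 32-bit reciprocal per pixel, indexed by that pixel's alpha.
// Equivalent in effect to the hand-rolled movzb/vmovd sequence in the diff.
static inline __m256i GatherAlphaReciprocals(const uint8_t* src_argb,
                                             const uint32_t* inv_tbl) {
  // 8 BGRA pixels; viewed as little-endian 32-bit lanes, alpha is the
  // most significant byte, so a logical right shift by 24 isolates it.
  __m256i pixels = _mm256_loadu_si256((const __m256i*)src_argb);
  __m256i alphas = _mm256_srli_epi32(pixels, 24);
  // vpgatherdd: load inv_tbl[alpha] for all 8 lanes (scale = 4 bytes).
  return _mm256_i32gather_epi32((const int*)inv_tbl, alphas, 4);
}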
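Several shuffle tables in this change (kShuffleMirror_AVX2, kShuffleAlpha_AVX2, kUnattenShuffleAlpha_AVX2) also shrink from 32-byte ulvec8 to 16-byte uvec8 and are loaded with vbroadcastf128 instead of vmovdqa. Since vpshufb shuffles within each 128-bit lane independently, a lane-local pattern only needs to be stored once and broadcast into both lanes. An intrinsics sketch of that load (BroadcastShuffleTable is a stand-in name):

#include <immintrin.h>
#include <stdint.h>

// Load a 16-byte per-lane shuffle pattern and duplicate it into both halves
// of a ymm register, matching a vbroadcastf128/vbroadcasti128 from memory.
static inline __m256i BroadcastShuffleTable(const uint8_t tbl[16]) {
  __m128i lane = _mm_loadu_si128((const __m128i*)tbl);
  return _mm256_broadcastsi128_si256(lane);  // vbroadcasti128
}

This halves the table footprint, and unlike vmovdqa on a ymm operand, the broadcast load imposes no 32-byte alignment requirement, consistent with the alignment-comment removals elsewhere in the diff.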