diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 163da1014..e58a42cd0 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -10,7 +10,7 @@
 
 #include "libyuv/cpu_id.h"
 
-#if (defined(_MSC_VER) && !defined(__clang__)) && !defined(__clang__)
+#if defined(_MSC_VER) && !defined(__clang__)
 #include <intrin.h>  // For __cpuidex()
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -207,8 +207,8 @@ int InitCpuFlags(void) {
 
 #ifdef HAS_XGETBV
     // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
-    if ((cpu_info1[2] & 0x1c000000) == 0x1c000000 &&  // AVX and OSXSave
-        (GetXCR0() & 6) == 6) {  // Test OD saves YMM registers
+    if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
+        ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
       cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
 
       // Detect AVX512bw
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 459df6fe0..c73fa8ab4 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -620,7 +620,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0,
 #if defined(HAS_BLENDPLANEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     // BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 32)) {
       BlendPlaneRow = BlendPlaneRow_AVX2;
     }
   }
@@ -688,7 +688,7 @@ int I420Blend(const uint8* src_y0, int src_stride_y0,
 #if defined(HAS_BLENDPLANEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     // BlendPlaneRow = BlendPlaneRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 16)) {
+    if (IS_ALIGNED(halfwidth, 32)) {
       BlendPlaneRow = BlendPlaneRow_AVX2;
     }
   }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 12c7dd884..d406c7c77 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -3469,7 +3469,10 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                          const uint8* alpha, uint8* dst, int width) {
   asm volatile (
@@ -3514,8 +3517,11 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 #endif  // HAS_BLENDPLANEROW_SSSE3
 
 #ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 16 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
   asm volatile (
@@ -3531,27 +3537,30 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     "sub        %2,%1                          \n"
     "sub        %2,%3                          \n"
 
-    // 16 pixel loop.
+    // 32 pixel loop.
     LABELALIGN
   "1:                                          \n"
-    "vmovdqu    (%2),%%xmm0                    \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+    "vmovdqu    (%2),%%ymm0                    \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
     "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
     "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%xmm1               \n"
-    "vmovdqu    (%1,%2,1),%%xmm2               \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+    "vmovdqu    (%0,%2,1),%%ymm1               \n"
+    "vmovdqu    (%1,%2,1),%%ymm2               \n"
+    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
     "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
     "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
     "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
     "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
     "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0,(%3,%2,1)               \n"
-    "lea        0x10(%2),%2                    \n"
-    "sub        $0x10,%4                       \n"
+    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
+    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
+    "lea        0x20(%2),%2                    \n"
+    "sub        $0x20,%4                       \n"
     "jg         1b                             \n"
     "vzeroupper                                \n"
   : "+r"(src0),       // %0
@@ -3559,7 +3568,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     "+r"(alpha),      // %2
     "+r"(dst),        // %3
     "+r"(width)       // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
+  :: "memory", "cc", "eax",
+     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
 }
 #endif  // HAS_BLENDPLANEROW_AVX2
diff --git a/source/row_win.cc b/source/row_win.cc
index 13076ce60..68f37f317 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4065,7 +4065,10 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 
 #ifdef HAS_BLENDPLANEROW_SSSE3
 // Blend 8 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
                          const uint8* alpha, uint8* dst, int width) {
@@ -4116,8 +4119,11 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
 #endif  // HAS_BLENDPLANEROW_SSSE3
 
 #ifdef HAS_BLENDPLANEROW_AVX2
-// Blend 16 pixels at a time.
-// =((G2*C2)+(H2*(D2))+32768+127)/256
+// Blend 32 pixels at a time.
+// unsigned version of math
+// =((A2*C2)+(B2*(255-C2))+255)/256
+// signed version of math
+// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
 __declspec(naked)
 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
                         const uint8* alpha, uint8* dst, int width) {
@@ -4141,27 +4147,30 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
     sub        edx, esi
     sub        edi, esi
 
-    // 16 pixel loop.
- convertloop16:
-    vmovdqu    xmm0, [esi]        // alpha
-    vpermq     ymm0, ymm0, 0xd8
-    vpunpcklbw ymm0, ymm0, ymm0
+    // 32 pixel loop.
+ convertloop32:
+    vmovdqu    ymm0, [esi]        // alpha
+    vpunpckhbw ymm3, ymm0, ymm0   // 8..15, 24..31
+    vpunpcklbw ymm0, ymm0, ymm0   // 0..7, 16..23
+    vpxor      ymm3, ymm3, ymm5   // a, 255-a
     vpxor      ymm0, ymm0, ymm5   // a, 255-a
-    vmovdqu    xmm1, [eax + esi]  // src0
-    vmovdqu    xmm2, [edx + esi]  // src1
-    vpermq     ymm1, ymm1, 0xd8
-    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    ymm1, [eax + esi]  // src0
+    vmovdqu    ymm2, [edx + esi]  // src1
+    vpunpckhbw ymm4, ymm1, ymm2
     vpunpcklbw ymm1, ymm1, ymm2
+    vpsubb     ymm4, ymm4, ymm6   // bias src0/1 - 128
     vpsubb     ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpmaddubsw ymm3, ymm3, ymm4
     vpmaddubsw ymm0, ymm0, ymm1
+    vpaddw     ymm3, ymm3, ymm7   // unbias result - 32768 and round.
     vpaddw     ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpsrlw     ymm3, ymm3, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm0
-    vpermq     ymm0, ymm0, 0xd8
-    vmovdqu    [edi + esi], xmm0
-    lea        esi, [esi + 16]
-    sub        ecx, 16
-    jg         convertloop16
+    vpackuswb  ymm0, ymm0, ymm3
+    vmovdqu    [edi + esi], ymm0
+    lea        esi, [esi + 32]
+    sub        ecx, 32
+    jg         convertloop32
 
     pop        edi
     pop        esi
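
For reference, a minimal scalar sketch of the blend math the SSSE3/AVX2 rows above implement (the helper names below are illustrative assumptions, not libyuv API). vpmaddubsw multiplies unsigned bytes by signed bytes, so the interleaved alpha pair (a, 255-a) stays unsigned while the interleaved sources are biased by -128 (vpsubb with 0x80 in ymm6), and the word results are unbiased and rounded by adding 0x807f = 32768 + 127 (vpaddw with ymm7) before the shift right by 8.

// Scalar sketch only; BlendPixelUnsigned/BlendPixelSigned are illustrative
// names, not functions from libyuv.
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Unsigned form from the comment: =((A2*C2)+(B2*(255-C2))+255)/256.
static uint8_t BlendPixelUnsigned(uint8_t s0, uint8_t s1, uint8_t a) {
  return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
}

// Signed form used by the SIMD rows:
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256.
static uint8_t BlendPixelSigned(uint8_t s0, uint8_t s1, uint8_t a) {
  int b0 = (int)s0 - 128;                      // biased source (vpsubb ymm6)
  int b1 = (int)s1 - 128;
  int sum = a * b0 + (255 - a) * b1;           // per-word vpmaddubsw result;
                                               // stays within int16, no saturation
  return (uint8_t)((sum + 32768 + 127) >> 8);  // vpaddw ymm7, then vpsrlw 8
}

int main() {
  // Exhaustively confirm the two forms agree, since
  // a*(s0-128) + (255-a)*(s1-128) + 32768 + 127 == a*s0 + (255-a)*s1 + 255.
  for (int a = 0; a < 256; ++a) {
    for (int s0 = 0; s0 < 256; ++s0) {
      for (int s1 = 0; s1 < 256; ++s1) {
        assert(BlendPixelUnsigned((uint8_t)s0, (uint8_t)s1, (uint8_t)a) ==
               BlendPixelSigned((uint8_t)s0, (uint8_t)s1, (uint8_t)a));
      }
    }
  }
  printf("unsigned and signed blend forms agree for all inputs\n");
  return 0;
}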