diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 72afce84d..1e77ae723 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -162,12 +162,6 @@ int ARGBBlend(const uint8* src_argb, int src_stride_argb,
     ARGBBlendRow = ARGBBlendRow_SSE2;
   }
 #endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(width, 2)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-  }
-#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBBlendRow(src_argb, dst_argb, width);
diff --git a/source/rotate.cc b/source/rotate.cc
index ad1078774..310ff4935 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -996,6 +996,7 @@ void RotateUV270(const uint8* src, int src_stride,
                  width, height);
 }
 
+// Rotate 180 is a horizontal and vertical flip.
 void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
diff --git a/source/row.h b/source/row.h
index 14bc6dca0..169bf1e60 100644
--- a/source/row.h
+++ b/source/row.h
@@ -65,7 +65,6 @@ extern "C" {
 #endif
 
 #if defined(_MSC_VER)
-#define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBBLENDROW_SSE2
 #endif
 
@@ -244,7 +243,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width);
 
-void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 
diff --git a/source/row_common.cc b/source/row_common.cc
index 224f7f4f9..9372a9449 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -514,76 +514,6 @@ void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
   }
 }
 
-#if 0
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    uint32 f = *(uint32*)src_argb;
-    uint32 a = f >> 24;
-    if (a) {
-      const uint32 b = *(uint32*)dst_argb;
-      if (a < 255) {
-        const uint32 src_rb = f & 0x00ff00ff;
-        const uint32 dst_rb = b & 0x00ff00ff;
-        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
-                              0xff00ff00;
-
-        const uint32 src_g = f & 0x0000ff00;
-        const uint32 dst_g = b & 0x0000ff00;
-        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
-                              0x00ff0000);
-
-        f = ((out_rb | out_g) >> 8) | 0xff000000;
-      }
-      *(uint32*)dst_argb = f;
-    }
-
-    f = *(uint32*)(src_argb + 4);
-    a = f >> 24;
-    if (a) {
-      const uint32 b = *(uint32*)(dst_argb + 4);
-      if (a < 255) {
-        const uint32 src_rb = f & 0x00ff00ff;
-        const uint32 dst_rb = b & 0x00ff00ff;
-        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
-                              0xff00ff00;
-
-        const uint32 src_g = f & 0x0000ff00;
-        const uint32 dst_g = b & 0x0000ff00;
-        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
-                              0x00ff0000);
-
-        f = ((out_rb | out_g) >> 8) | 0xff000000;
-      }
-      *(uint32*)(dst_argb + 4) = f;
-    }
-    src_argb += 8;
-    dst_argb += 8;
-  }
-
-  if (width & 1) {
-    uint32 f = *(uint32*)src_argb;
-    uint32 a = f >> 24;
-    if (a) {
-      const uint32 b = *(uint32*)dst_argb;
-      if (a < 255) {
-        const uint32 src_rb = f & 0x00ff00ff;
-        const uint32 dst_rb = b & 0x00ff00ff;
-        const uint32 out_rb = (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) &
-                              0xff00ff00;
-
-        const uint32 src_g = f & 0x0000ff00;
-        const uint32 dst_g = b & 0x0000ff00;
-        const uint32 out_g = ((src_g * a + dst_g * (a ^ 0xff) + 0x00008000) &
-                              0x00ff0000);
-
-        f = ((out_rb | out_g) >> 8) | 0xff000000;
-      }
-      *(uint32*)dst_argb = f;
-    }
-  }
-}
-#endif
-
 // Wrappers to handle odd sizes/alignments
 #define MAKEYUVANY(NAMEANY, NAME, COPYROW)                                    \
     void NAMEANY(const uint8* y_buf,                                          \
diff --git a/source/row_win.cc b/source/row_win.cc
index 519edbb36..62c3b0c88 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -1909,75 +1909,55 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
-
-#ifdef HAS_ARGBBLENDROW_SSSE3
-// Shuffle table for copying alpha
-static const uvec8 kShuffleAlpha = {
-  7u, 7u, 7u, 7u, 7u, 7u, 0x80, 0x80, 15u, 15u, 15u, 15u, 15u, 15u, 0x80, 0x80
-};
-
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  __asm {
-    mov        eax, 0x00200020  // rounding constant for 8.6 fixed point
-    movd       xmm3, eax
-    pshufd     xmm3, xmm3, 0
-    mov        eax, 0x3f3f3f3f  // mask for alpha
-    movd       xmm7, eax
-    pshufd     xmm7, xmm7, 0
-    movdqa     xmm4, kShuffleAlpha
-    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-    pcmpeqb    xmm6, xmm6       // generate 0x00010001 for negating
-    psrlw      xmm6, 15
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
-
-  convertloop:
-    movq       xmm0, qword ptr [eax]  // fetch 2 pixels
-    movq       xmm1, qword ptr [eax + edx]
-    punpcklbw  xmm1, xmm0       // mix 2 pixels aArRgGbB_aArRgGbB
-    movdqa     xmm2, xmm1       // alpha from byte 7 and 15
-    pshufb     xmm2, xmm4
-    pxor       xmm2, xmm5
-    psrlw      xmm2, 2
-    pand       xmm2, xmm7
-    paddw      xmm2, xmm6       // -a = (a^255)+1
-    pmaddubsw  xmm1, xmm2
-    paddw      xmm1, xmm3       // round
-    psrlw      xmm1, 6
-
-    packuswb   xmm1, xmm1       // pack 2 pixels
-    sub        ecx, 2
-    movq       qword ptr [eax + edx], xmm1
-    lea        eax, [eax + 8]
-    ja         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBBLENDROW_SSSE3
-
 #ifdef HAS_ARGBBLENDROW_SSE2
 // TODO(fbarchard): Single multiply method b+a(f-b)
 // TODO(fbarchard): Unroll and pair
-// TODO(fbarchard): Test for transparent and opaque common cases
+// TODO(fbarchard): Port to gcc
 __declspec(naked)
 void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
   __asm {
+    push       esi
+    mov        esi, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_argb
+    mov        ecx, [esp + 4 + 12]  // width
     pcmpeqb    xmm4, xmm4       // generate 0xffffffff do negative alpha
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    sub        edx, eax
+    sub        edx, esi
+
+    mov        eax, [esi]       // get first pixel
+    sub        ecx, 1           // ensure there are at least 2 pixels
+    je         last1            // last pixel?
+    cmp        eax, 0xFF000000  // opaque?
+    jae        opaqueloop
+    cmp        eax, 0x00FFFFFF  // translucient?
+    ja         translucientloop
+
+    align      16
+  transparentloop:
+    sub        ecx, 1
+    lea        esi, [esi + 4]
+    je         last1
+    mov        eax, [esi]       // handle remaining pixel
+    cmp        eax, 0x00FFFFFF  // transparent?
+    jbe        transparentloop
+    cmp        eax, 0xFF000000  // translucient?
+    jb         translucientloop
+
+    align      16
+  opaqueloop:
+    mov        dword ptr [esi + edx], eax
+    lea        esi, [esi + 4]
     sub        ecx, 1
     je         last1
+    mov        eax, [esi]       // handle remaining pixel
+    cmp        eax, 0xFF000000  // opaque?
+    jae        opaqueloop
+    cmp        eax, 0x00FFFFFF  // transparent?
+    jbe        transparentloop
 
-  convertloop:
-    movq       xmm0, qword ptr [eax]  // fetch 2 pixels
-    movq       xmm1, qword ptr [eax + edx]
+    align      4
+  translucientloop:
+    movq       xmm0, qword ptr [esi]  // fetch 2 pixels
+    movq       xmm1, qword ptr [esi + edx]
     punpcklbw  xmm0, xmm0       // src 16 bits
     punpcklbw  xmm1, xmm1       // dst 16 bits
    pshuflw    xmm2, xmm0, 0xff // src alpha
@@ -1989,19 +1969,25 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     paddw      xmm0, xmm1
     psrlw      xmm0, 8
     packuswb   xmm0, xmm0       // pack 2 pixels
+    movq       qword ptr [esi + edx], xmm0
+    lea        esi, [esi + 8]
     sub        ecx, 2
-    movq       qword ptr [eax + edx], xmm0
-    lea        eax, [eax + 8]
-    ja         convertloop
+    jbe        last1
+    mov        eax, [esi]       // handle remaining pixel
+    cmp        eax, 0x00FFFFFF  // transparent?
+    jbe        transparentloop
+    cmp        eax, 0xFF000000  // translucient?
+    jb         translucientloop
+    jmp        opaqueloop
 
+    align      4
   last1:
     add        ecx, 1
     je         done
 
-    mov        ecx, [eax]       // handle remaining pixel
-    movd       xmm0, ecx
-    mov        ecx, [eax + edx]
-    movd       xmm1, ecx
+    movd       xmm0, eax
+    mov        eax, [esi + edx]
+    movd       xmm1, eax
     punpcklbw  xmm0, xmm0       // src 16 bits
     punpcklbw  xmm1, xmm1       // dst 16 bits
     pshuflw    xmm2, xmm0, 0xff // src alpha
@@ -2012,17 +1998,16 @@ void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     pmulhuw    xmm1, xmm3       // dst * (a ^ 0xffff)
     paddw      xmm0, xmm1
     psrlw      xmm0, 8
-    packuswb   xmm0, xmm0       // pack 2 pixels
-
-    movd       ecx, xmm0
-    mov        dword ptr [eax + edx], ecx
+    packuswb   xmm0, xmm0       // pack to bytes
+    movd       eax, xmm0
+    mov        dword ptr [esi + edx], eax
 
   done:
-
+    pop        esi
     ret
   }
 }
-#endif  // HAS_ARGBBLENDROW_SSSE3
+#endif  // HAS_ARGBBLENDROW_SSE2
 
 #endif  // _M_IX86
 
diff --git a/source/scale.cc b/source/scale.cc
index 5f8f05a95..3c7bfe9ea 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -1702,18 +1702,18 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
   intptr_t tmp_src_stride = static_cast<intptr_t>(src_stride);
   asm volatile (
     "pxor      %%xmm4,%%xmm4                   \n"
-    "sub       $0x1,%6                         \n"
+    "sub       $0x1,%5                         \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "mov       %0,%3                           \n"
-    "add       %4,%0                           \n"
+    "add       %6,%0                           \n"
     "movdqa    %%xmm0,%%xmm1                   \n"
     "punpcklbw %%xmm4,%%xmm0                   \n"
     "punpckhbw %%xmm4,%%xmm1                   \n"
-    "mov       %6,%2                           \n"
+    "mov       %5,%2                           \n"
   "2:                                          \n"
     "movdqa    (%0),%%xmm2                     \n"
-    "add       %4,%0                           \n"
+    "add       %6,%0                           \n"
     "movdqa    %%xmm2,%%xmm3                   \n"
     "punpcklbw %%xmm4,%%xmm2                   \n"
     "punpckhbw %%xmm4,%%xmm3                   \n"
@@ -1725,16 +1725,15 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
     "movdqa    %%xmm1,0x10(%1)                 \n"
     "lea       0x10(%3),%0                     \n"
     "lea       0x20(%1),%1                     \n"
-    "sub       $0x10,%5                        \n"
+    "sub       $0x10,%4                        \n"
     "ja        1b                              \n"
   : "+r"(src_ptr),         // %0
     "+r"(dst_ptr),         // %1
    "+r"(tmp_height),      // %2
     "+r"(tmp_src),         // %3
-    "+rm"(tmp_src_stride),  // %4
-    "+rm"(src_width),      // %5
-    "+rm"(src_height)      // %6
-  :
+    "+rm"(src_width),      // %4
+    "+rm"(src_height)      // %5
+  : "+rm"(tmp_src_stride),  // %6
   : "memory", "cc"
 #if defined(__SSE2__)
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
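
Note on the row_win.cc change (not part of the patch): the rewritten ARGBBlendRow_SSE2 classifies the current source pixel by comparing the whole 32-bit ARGB value. Anything >= 0xFF000000 is opaque (alpha 0xFF) and is copied straight to the destination, anything <= 0x00FFFFFF is transparent (alpha 0x00) and leaves the destination untouched, and everything else falls to the translucent path (the translucientloop label), which does the multiply-blend two pixels at a time. The scalar sketch below only illustrates that per-pixel dispatch, reusing the fixed-point blend math from the #if 0 C reference this patch deletes; the name ARGBBlendRowRef_C and the use of uint8_t/uint32_t in place of libyuv's uint8/uint32 are illustrative stand-ins, not code from the tree.

#include <stdint.h>

// Scalar sketch of the per-pixel behavior of the rewritten blend row.
static void ARGBBlendRowRef_C(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width) {
  for (int x = 0; x < width; ++x) {
    const uint32_t src = *(const uint32_t*)(src_argb + x * 4);
    const uint32_t a = src >> 24;
    if (a == 0) {        // transparent: keep the destination pixel
      continue;
    }
    if (a == 255) {      // opaque: copy the source pixel through
      *(uint32_t*)(dst_argb + x * 4) = src;
      continue;
    }
    // Translucent: weight src by a and dst by (255 - a) in the red/blue and
    // green lanes, with the same +0x80 rounding and >> 8 as the C reference.
    const uint32_t dst = *(const uint32_t*)(dst_argb + x * 4);
    const uint32_t src_rb = src & 0x00ff00ff;
    const uint32_t dst_rb = dst & 0x00ff00ff;
    const uint32_t out_rb =
        (src_rb * a + dst_rb * (a ^ 0xff) + 0x00800080) & 0xff00ff00;
    const uint32_t src_g = src & 0x0000ff00;
    const uint32_t dst_g = dst & 0x0000ff00;
    const uint32_t out_g =
        (src_g * a + dst_g * (a ^ 0xff) + 0x00008000) & 0x00ff0000;
    *(uint32_t*)(dst_argb + x * 4) = ((out_rb | out_g) >> 8) | 0xff000000;
  }
}

Comparing the packed pixel against 0xFF000000 and 0x00FFFFFF is what lets the assembly detect the opaque and transparent fast paths with a single compare per pixel instead of extracting the alpha byte first.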