diff --git a/README.chromium b/README.chromium index 576dd582c..4191aee72 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1495 +Version: 1496 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ea1323c70..e780f6998 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -121,7 +121,6 @@ extern "C" { #define HAS_H422TOARGBROW_SSSE3 #define HAS_H422TOABGRROW_SSSE3 #define HAS_MERGEUVROW_SSE2 -#define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_UV_SSSE3 #define HAS_MIRRORUVROW_SSSE3 @@ -181,8 +180,7 @@ extern "C" { #define HAS_SOBELYROW_SSE2 #endif -// The following are available on x64 Visual C and clangcl. -// TODO(fbarchard): Port to gcc. +// The following are also available on x64 Visual C. #if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) #define HAS_I422TOARGBROW_SSSE3 @@ -262,16 +260,6 @@ extern "C" { #define HAS_ARGBUNATTENUATEROW_AVX2 #endif -// The following are disabled when SSSE3 is available: -// TODO(fbarchard): remove sse2. ssse3 is faster and well supported. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ - !defined(LIBYUV_SSSE3_ONLY) -#define HAS_ARGBATTENUATEROW_SSE2 -#define HAS_ARGBBLENDROW_SSE2 -#define HAS_MIRRORROW_SSE2 -#endif - // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -822,7 +810,6 @@ void ARGBToUVJ422Row_C(const uint8* src_argb, void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); @@ -1620,8 +1607,6 @@ void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); // ARGB preattenuated alpha blend. void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); -void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1, uint8* dst_argb, int width); void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1, @@ -1941,7 +1926,6 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, // Effects related row functions. void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e55cbcf6d..925617760 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1495 +#define LIBYUV_VERSION 1496 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 51817da16..d205f53fb 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -390,14 +390,6 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, I422AlphaToARGBRow = I422AlphaToARGBRow_MIPS_DSPR2; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSE2; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -424,7 +416,8 @@ int I420AlphaToARGB(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants, width); + I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, &kYuvConstants, + width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } @@ -500,14 +493,6 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, I422AlphaToABGRRow = I422AlphaToABGRRow_MIPS_DSPR2; } #endif -#if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSE2; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; @@ -534,7 +519,8 @@ int I420AlphaToABGR(const uint8* src_y, int src_stride_y, #endif for (y = 0; y < height; ++y) { - I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants, width); + I422AlphaToABGRRow(src_y, src_u, src_v, src_a, dst_abgr, &kYuvConstants, + width); if (attenuate) { ARGBAttenuateRow(dst_abgr, dst_abgr, width); } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 2299ab892..55160df1e 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -237,14 +237,6 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MirrorRow = MirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSE2; - } - } -#endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; @@ -541,11 +533,6 @@ ARGBBlendRow GetARGBBlend() { return ARGBBlendRow; } #endif -#if defined(HAS_ARGBBLENDROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBBlendRow = ARGBBlendRow_SSE2; - } -#endif #if defined(HAS_ARGBBLENDROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; @@ -1267,14 +1254,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } -#if defined(HAS_ARGBATTENUATEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBAttenuateRow = ARGBAttenuateRow_SSE2; - } - } -#endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; diff --git a/source/rotate.cc b/source/rotate.cc index dac7113dd..31e04af9c 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -117,14 +117,6 @@ void RotatePlane180(const uint8* src, int src_stride, } } #endif -#if defined(HAS_MIRRORROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - MirrorRow = MirrorRow_Any_SSE2; - if (IS_ALIGNED(width, 16)) { - MirrorRow = MirrorRow_SSE2; - } - } -#endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; diff --git a/source/row_any.cc b/source/row_any.cc index 55c24dc15..7d0821dac 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -443,9 +443,6 @@ ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif -#ifdef HAS_ARGBATTENUATEROW_SSE2 -ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3) -#endif #ifdef HAS_ARGBUNATTENUATEROW_SSE2 ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) #endif @@ -617,9 +614,6 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #ifdef HAS_MIRRORROW_SSSE3 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif -#ifdef HAS_MIRRORROW_SSE2 -ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15) -#endif #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 4aacca3d0..de8769d33 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1679,7 +1679,7 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] -#endif +#endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1711,7 +1711,7 @@ void OMITFP I422AlphaToABGRRow_SSSE3(const uint8* y_buf, [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] -#endif +#endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1987,8 +1987,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ - "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) " \n" \ - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" + "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ + "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" // Store 16 ABGR values. #define STOREABGR_AVX2 \ @@ -1999,8 +1999,8 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpunpcklwd %%ymm2,%%ymm1,%%ymm0 \n" \ "vpunpckhwd %%ymm2,%%ymm1,%%ymm1 \n" \ "vmovdqu %%ymm0," MEMACCESS([dst_abgr]) " \n" \ - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_abgr]) " \n" \ - "lea " MEMLEA(0x40,[dst_abgr]) ",%[dst_abgr] \n" + "vmovdqu %%ymm1," MEMACCESS2(0x20, [dst_abgr]) " \n" \ + "lea " MEMLEA(0x40, [dst_abgr]) ", %[dst_abgr] \n" #if defined(HAS_I422TOBGRAROW_AVX2) // 16 pixels @@ -2026,7 +2026,6 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" // ARGB first 8 pixels "vpunpckhwd %%ymm1,%%ymm2,%%ymm2 \n" // ARGB next 8 pixels - "vmovdqu %%ymm0," MEMACCESS([dst_bgra]) "\n" "vmovdqu %%ymm2," MEMACCESS2(0x20,[dst_bgra]) "\n" "lea " MEMLEA(0x40,[dst_bgra]) ",%[dst_bgra] \n" @@ -2106,7 +2105,7 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] -#endif +#endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -2143,7 +2142,7 @@ void OMITFP I422AlphaToABGRRow_AVX2(const uint8* y_buf, [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] -#endif +#endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -2233,7 +2232,6 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, struct YuvConstants* yuvconstants, int width) { - asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN @@ -2263,7 +2261,6 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, uint8* dst_argb, struct YuvConstants* yuvconstants, int width) { - asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN @@ -2293,7 +2290,6 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, uint8* dst_argb, struct YuvConstants* yuvconstants, int width) { - asm volatile ( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN @@ -2461,34 +2457,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORROW_SSE2 -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { - intptr_t temp_width = (intptr_t)(width); - asm volatile ( - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "movdqa %%xmm0,%%xmm1 \n" - "psllw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshufd $0x4e,%%xmm0,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1)",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); -} -#endif // HAS_MIRRORROW_SSE2 - #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. static uvec8 kShuffleMirrorUV = { @@ -3333,92 +3301,6 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, } #endif // HAS_YUY2TOYROW_AVX2 -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time. -void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "41: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 41b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. - "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} -#endif // HAS_ARGBBLENDROW_SSE2 - #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. static uvec8 kShuffleAlpha = { @@ -3427,15 +3309,6 @@ static uvec8 kShuffleAlpha = { }; // Blend 8 pixels at a time -// Shuffle table for reversing the bytes. - -// Same as SSE2, but replaces -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3,0F5h // 8 alpha words -// pshuflw xmm3, xmm3,0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha - void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -3516,50 +3389,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATEROW_SSE2 -// Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x8,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pshufhw $0xff,%%xmm0,%%xmm2 \n" - "pshuflw $0xff,%%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pshufhw $0xff,%%xmm1,%%xmm2 \n" - "pshuflw $0xff,%%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "pand %%xmm4,%%xmm2 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); -} -#endif // HAS_ARGBATTENUATEROW_SSE2 - #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha static uvec8 kShuffleAlpha0 = { diff --git a/source/row_win.cc b/source/row_win.cc index e5b27b835..6e35c70c6 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3460,32 +3460,6 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORROW_SSE2 -__declspec(naked) -void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // width - - convertloop: - movdqu xmm0, [eax - 16 + ecx] - movdqa xmm1, xmm0 // swap bytes - psllw xmm0, 8 - psrlw xmm1, 8 - por xmm0, xmm1 - pshuflw xmm0, xmm0, 0x1b // swap words - pshufhw xmm0, xmm0, 0x1b - pshufd xmm0, xmm0, 0x4e // swap qwords - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 16 - jg convertloop - ret - } -} -#endif // HAS_MIRRORROW_SSE2 - #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorUV = { @@ -4382,107 +4356,14 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, } #endif // HAS_YUY2TOYROW_SSE2 -#ifdef HAS_ARGBBLENDROW_SSE2 -// Blend 8 pixels at a time. -__declspec(naked) -void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 1 - psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff - psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 - psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - sub ecx, 4 - jl convertloop4b // less than 4 pixels? - - // 4 pixel loop. - convertloop4: - movdqu xmm3, [eax] // src argb - lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3, 0F5h // 8 alpha words - pshuflw xmm3, xmm3, 0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g - lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jge convertloop4 - - convertloop4b: - add ecx, 4 - 1 - jl convertloop1b - - // 1 pixel loop. - convertloop1: - movd xmm3, [eax] // src argb - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3, 0F5h // 8 alpha words - pshuflw xmm3, xmm3, 0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge convertloop1 - - convertloop1b: - pop esi - ret - } -} -#endif // HAS_ARGBBLENDROW_SSE2 - #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. static const uvec8 kShuffleAlpha = { 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 }; -// Same as SSE2, but replaces: -// psrlw xmm3, 8 // alpha -// pshufhw xmm3, xmm3, 0F5h // 8 alpha words -// pshuflw xmm3, xmm3, 0F5h -// with.. -// pshufb xmm3, kShuffleAlpha // alpha -// Blend 8 pixels at a time. +// Blend 8 pixels at a time. __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { @@ -4564,48 +4445,6 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATEROW_SSE2 -// Attenuate 4 pixels at a time. -__declspec(naked) -void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 - pslld xmm4, 24 - pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff - psrld xmm5, 8 - - convertloop: - movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm0 // first 2 - pshufhw xmm2, xmm0, 0FFh // 8 alpha words - pshuflw xmm2, xmm2, 0FFh - pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm1 // next 2 pixels - pshufhw xmm2, xmm1, 0FFh // 8 alpha words - pshuflw xmm2, xmm2, 0FFh - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // alphas - lea eax, [eax + 16] - psrlw xmm0, 8 - pand xmm2, xmm4 - psrlw xmm1, 8 - packuswb xmm0, xmm1 - pand xmm0, xmm5 // keep original alphas - por xmm0, xmm2 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg convertloop - - ret - } -} -#endif // HAS_ARGBATTENUATEROW_SSE2 - #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = {