diff --git a/source/compare.cc b/source/compare.cc index d7086bf1e..1cbb72e3e 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -23,35 +23,30 @@ namespace libyuv { static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { volatile uint32 sse; - asm volatile - ( - "vmov.u8 q7, #0\n" - "vmov.u8 q9, #0\n" - "vmov.u8 q8, #0\n" - "vmov.u8 q10, #0\n" + asm volatile ( + "vmov.u8 q7, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" - "1:\n" - "vld1.u8 {q0}, [%0]!\n" - "vld1.u8 {q1}, [%1]!\n" - - "vsubl.u8 q2, d0, d2\n" - "vsubl.u8 q3, d1, d3\n" - - "vmlal.s16 q7, d4, d4\n" - "vmlal.s16 q8, d6, d6\n" - "vmlal.s16 q8, d5, d5\n" - "vmlal.s16 q10, d7, d7\n" - - "subs %2, %2, #16\n" - "bhi 1b\n" - - "vadd.u32 q7, q7, q8\n" - "vadd.u32 q9, q9, q10\n" - "vadd.u32 q10, q7, q9\n" - "vpaddl.u32 q1, q10\n" - "vadd.u64 d0, d2, d3\n" - "vmov.32 %3, d0[0]\n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" + "vld1.u8 {q1}, [%1]! \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q7, d4, d4 \n" + "vmlal.s16 q8, d6, d6 \n" + "vmlal.s16 q8, d5, d5 \n" + "vmlal.s16 q10, d7, d7 \n" + "subs %2, %2, #16 \n" + "bhi 1b \n" + "vadd.u32 q7, q7, q8 \n" + "vadd.u32 q9, q9, q10 \n" + "vadd.u32 q10, q7, q9 \n" + "vpaddl.u32 q1, q10 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), @@ -59,7 +54,6 @@ static uint32 SumSquareError_NEON(const uint8* src_a, : : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10" ); - return sse; } @@ -102,7 +96,6 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, pshufd xmm1, xmm0, 01h paddd xmm0, xmm1 movd eax, xmm0 - ret } } @@ -112,11 +105,12 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, // DISABLE //#define HAS_SUMSQUAREERROR_SSE2 // DISABLE +#if HAS_SUMSQUAREERROR_SSE2 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { volatile uint32 sse; - asm volatile( - "\n" + asm volatile ( + " \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 @@ -131,6 +125,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, } #endif #endif +#endif static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { @@ -148,7 +143,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, int count) { uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count); - #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -162,10 +156,8 @@ uint64 ComputeSumSquareError(const uint8* src_a, { SumSquareError = SumSquareError_C; } - const int kBlockSize = 4096; uint64 diff = 0; - while (count >= kBlockSize) { diff += SumSquareError(src_a, src_b, kBlockSize); src_a += kBlockSize; @@ -179,7 +171,6 @@ uint64 ComputeSumSquareError(const uint8* src_a, diff += static_cast(SumSquareError_C(src_a, src_b, count)); } } - return diff; } @@ -188,7 +179,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, int width, int height) { uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count); - #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON) && (width % 16 == 0)) { @@ -200,7 +190,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, } uint64 sse = 0; - for (int h = 0; h < height; ++h) { sse += static_cast(SumSquareError(src_a, src_b, width)); src_a += stride_a; @@ -210,11 +199,10 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, return sse; } -double 
Sse2Psnr(double Samples, double Sse) { +double Sse2Psnr(double samples, double sse) { double psnr; - - if (Sse > 0.0) - psnr = 10.0 * log10(255.0 * 255.0 * Samples / Sse); + if (sse > 0.0) + psnr = 10.0 * log10(255.0 * 255.0 * samples / sse); else psnr = kMaxPsnr; // Limit to prevent divide by 0 @@ -224,6 +212,21 @@ double Sse2Psnr(double Samples, double Sse) { return psnr; } +double Sse2Psnr(uint64 samples, uint64 sse) { + double psnr; + if (sse > 0) { + double mse = static_cast(samples) / static_cast(sse); + psnr = 10.0 * log10(255.0 * 255.0 * mse); + } else { + psnr = kMaxPsnr; // Limit to prevent divide by 0 + } + + if (psnr > kMaxPsnr) + psnr = kMaxPsnr; + + return psnr; +} + double CalcFramePsnr(const uint8* src_a, int stride_a, const uint8* src_b, int stride_b, int width, int height) { @@ -233,7 +236,7 @@ double CalcFramePsnr(const uint8* src_a, int stride_a, src_b, stride_b, width, height); - return Sse2Psnr (samples, sse); + return Sse2Psnr(samples, sse); } double I420Psnr(const uint8* src_y_a, int stride_y_a, diff --git a/source/cpu_id.cc b/source/cpu_id.cc index a936fafb5..4bbb90fbe 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -22,9 +22,9 @@ #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static inline void __cpuid(int cpu_info[4], int info_type) { asm volatile ( - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type) ); @@ -32,7 +32,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) { #elif defined(__i386__) || defined(__x86_64__) static inline void __cpuid(int cpu_info[4], int info_type) { asm volatile ( - "cpuid\n" + "cpuid \n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type) ); diff --git a/source/format_conversion.cc b/source/format_conversion.cc index ebdc38788..8ead34191 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -50,17 +50,17 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb, #define HAS_ARGBTOBAYERROW_SSSE3 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) { - asm volatile( - "movd %3,%%xmm5\n" - "pshufd $0x0,%%xmm5,%%xmm5\n" -"1:\n" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + asm volatile ( + "movd %3,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 "+r"(pix) // %2 diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 05f23e613..ef4ad844f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -23,14 +23,13 @@ namespace libyuv { // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels. static void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { - __asm__ volatile - ( - "1:\n" - "vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV - "vst1.u8 {q0}, [%1]!\n" // store U - "vst1.u8 {q1}, [%2]!\n" // Store V - "subs %3, %3, #16\n" // 16 processed per loop - "bhi 1b\n" + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV + "vst1.u8 {q0}, [%1]! \n" // store U + "vst1.u8 {q1}, [%2]! 
\n" // Store V + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" : "+r"(src_uv), "+r"(dst_u), "+r"(dst_v), @@ -57,7 +56,7 @@ static void SplitUV_SSE2(const uint8* src_uv, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] @@ -74,7 +73,7 @@ static void SplitUV_SSE2(const uint8* src_uv, movdqa [edi], xmm2 lea edi, [edi + 16] sub ecx, 16 - ja wloop + ja convertloop pop edi ret } @@ -85,27 +84,27 @@ static void SplitUV_SSE2(const uint8* src_uv, #define HAS_SPLITUV_SSE2 static void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "psrlw $0x8,%%xmm2\n" - "psrlw $0x8,%%xmm3\n" - "packuswb %%xmm3,%%xmm2\n" - "movdqa %%xmm2,(%2)\n" - "lea 0x10(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -239,13 +238,12 @@ int I420Mirror(const uint8* src_y, int src_stride_y, #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) #define HAS_SETROW_NEON static void SetRow32_NEON(uint8* dst, uint32 v32, int count) { - __asm__ volatile - ( - "vdup.u32 q0, %2\n" // duplicate 4 ints - "1:\n" - "vst1.u32 {q0}, [%0]!\n" // store - "subs %1, %1, #16\n" // 16 processed per loop - "bhi 1b\n" + asm volatile ( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "vst1.u32 {q0}, [%0]! 
\n" // store + "subs %1, %1, #16 \n" // 16 processed per loop + "bhi 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 @@ -263,11 +261,11 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { mov ecx, [esp + 12] // count pshufd xmm5, xmm5, 0 - wloop: + convertloop: movdqa [eax], xmm5 lea eax, [eax + 16] sub ecx, 16 - ja wloop + ja convertloop ret } } @@ -277,14 +275,14 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { #define HAS_SETROW_SSE2 static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) { - asm volatile( - "movd %2, %%xmm5\n" - "pshufd $0x0,%%xmm5,%%xmm5\n" -"1:" - "movdqa %%xmm5,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%1\n" - "ja 1b\n" + asm volatile ( + "movd %2, %%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" +"1: \n" + "movdqa %%xmm5,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%1 \n" + "ja 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 @@ -561,7 +559,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] @@ -585,7 +583,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, movq qword ptr [edi], xmm1 lea edi, [edi + 8] sub ecx, 16 - ja wloop + ja convertloop pop edi pop esi @@ -598,34 +596,34 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, #define HAS_SPLITYUY2_SSE2 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "pand %%xmm5,%%xmm2\n" - "pand %%xmm5,%%xmm3\n" - "packuswb %%xmm3,%%xmm2\n" - "movdqa %%xmm2,(%1)\n" - "lea 0x10(%1),%1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm5,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%2)\n" - "lea 0x8(%2),%2\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%3)\n" - "lea 0x8(%3),%3\n" - "sub $0x10,%4\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm1,(%3) \n" + "lea 0x8(%3),%3 \n" + "sub $0x10,%4 \n" + "ja 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(dst_u), // %2 @@ -716,7 +714,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] @@ -726,7 +724,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja wloop + ja convertloop ret } } @@ -745,7 +743,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + esi] @@ -766,7 +764,7 @@ 
void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, movq qword ptr [edi], xmm1 lea edi, [edi + 8] sub ecx, 16 - ja wloop + ja convertloop pop edi pop esi @@ -783,7 +781,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy, mov edx, [esp + 8] // dst_y mov ecx, [esp + 12] // pix - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] lea eax, [eax + 32] @@ -793,7 +791,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy, movdqa [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 - ja wloop + ja convertloop ret } } @@ -812,7 +810,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 - wloop: + convertloop: movdqa xmm0, [eax] movdqa xmm1, [eax + 16] movdqa xmm2, [eax + esi] @@ -833,7 +831,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, movq qword ptr [edi], xmm1 lea edi, [edi + 8] sub ecx, 16 - ja wloop + ja convertloop pop edi pop esi @@ -847,20 +845,20 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, #define HAS_YUY2TOI420ROW_SSE2 static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -874,31 +872,31 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2, static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm5,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm1,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 @@ -913,18 +911,18 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2, #define HAS_UYVYTOI420ROW_SSE2 static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { - asm volatile( -"1:" - 
"movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -938,31 +936,31 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy, static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_y, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "pand %%xmm5,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm1\n" - "movq %%xmm1,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x10,%3\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm1,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_y), // %2 diff --git a/source/rotate.cc b/source/rotate.cc index efd674d86..3581dff1d 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -282,78 +282,78 @@ __asm { #define HAS_TRANSPOSE_WX8_SSSE3 static void TransposeWx8_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - asm volatile( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. 
-"1:\n" - "movq (%0),%%xmm0\n" - "movq (%0,%3),%%xmm1\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm1,%%xmm0\n" - "movq (%0),%%xmm2\n" - "movdqa %%xmm0,%%xmm1\n" - "palignr $0x8,%%xmm1,%%xmm1\n" - "movq (%0,%3),%%xmm3\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm3,%%xmm2\n" - "movdqa %%xmm2,%%xmm3\n" - "movq (%0),%%xmm4\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "movq (%0,%3),%%xmm5\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm5,%%xmm4\n" - "movdqa %%xmm4,%%xmm5\n" - "movq (%0),%%xmm6\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq (%0,%3),%%xmm7\n" - "lea (%0,%3,2),%0\n" - "punpcklbw %%xmm7,%%xmm6\n" - "neg %3\n" - "movdqa %%xmm6,%%xmm7\n" - "lea 0x8(%0,%3,8),%0\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "neg %3\n" +"1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "palignr $0x8,%%xmm2,%%xmm2\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm4,%%xmm6\n" - "movdqa %%xmm5,%%xmm7\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "palignr $0x8,%%xmm7,%%xmm7\n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqa %%xmm0,%%xmm4\n" - "palignr $0x8,%%xmm4,%%xmm4\n" - "movq %%xmm4,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm6,%%xmm2\n" - "movdqa %%xmm2,%%xmm6\n" - "movq %%xmm2,(%1)\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "punpckldq %%xmm5,%%xmm1\n" - "movq %%xmm6,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm1,%%xmm5\n" - "movq %%xmm1,(%1)\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq %%xmm5,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm7,%%xmm3\n" - "movq %%xmm3,(%1)\n" - "movdqa %%xmm3,%%xmm7\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "movq %%xmm7,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -372,258 +372,258 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int w); - asm( - ".text\n" + asm volatile ( + ".text \n" #if defined(OSX) - ".globl _TransposeUVWx8_SSE2\n" -"_TransposeUVWx8_SSE2:\n" + ".globl _TransposeUVWx8_SSE2 \n" +"_TransposeUVWx8_SSE2: \n" #else - ".global TransposeUVWx8_SSE2\n" -"TransposeUVWx8_SSE2:\n" + ".global TransposeUVWx8_SSE2 \n" +"TransposeUVWx8_SSE2: \n" #endif - "push %ebx\n" - "push %esi\n" - "push %edi\n" - "push %ebp\n" - "mov 0x14(%esp),%eax\n" - "mov 0x18(%esp),%edi\n" - "mov 0x1c(%esp),%edx\n" - "mov 0x20(%esp),%esi\n" - "mov 0x24(%esp),%ebx\n" - "mov 0x28(%esp),%ebp\n" - "mov %esp,%ecx\n" - "sub $0x14,%esp\n" - "and $0xfffffff0,%esp\n" - "mov %ecx,0x10(%esp)\n" - "mov 0x2c(%ecx),%ecx\n" + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" -"1:\n" - "movdqa (%eax),%xmm0\n" - "movdqa (%eax,%edi,1),%xmm1\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm0,%xmm7\n" - "punpcklbw %xmm1,%xmm0\n" - "punpckhbw %xmm1,%xmm7\n" - "movdqa %xmm7,%xmm1\n" - "movdqa (%eax),%xmm2\n" - "movdqa (%eax,%edi,1),%xmm3\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm2,%xmm7\n" - "punpcklbw %xmm3,%xmm2\n" - "punpckhbw %xmm3,%xmm7\n" - "movdqa %xmm7,%xmm3\n" - "movdqa (%eax),%xmm4\n" - "movdqa (%eax,%edi,1),%xmm5\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm4,%xmm7\n" - "punpcklbw %xmm5,%xmm4\n" - "punpckhbw %xmm5,%xmm7\n" - "movdqa %xmm7,%xmm5\n" - "movdqa (%eax),%xmm6\n" - "movdqa (%eax,%edi,1),%xmm7\n" - "lea (%eax,%edi,2),%eax\n" - "movdqa %xmm5,(%esp)\n" - "neg %edi\n" - "movdqa %xmm6,%xmm5\n" - "punpcklbw %xmm7,%xmm6\n" - "punpckhbw %xmm7,%xmm5\n" - "movdqa %xmm5,%xmm7\n" - "lea 0x10(%eax,%edi,8),%eax\n" - "neg %edi\n" - "movdqa %xmm0,%xmm5\n" - "punpcklwd %xmm2,%xmm0\n" - 
"punpckhwd %xmm2,%xmm5\n" - "movdqa %xmm5,%xmm2\n" - "movdqa %xmm1,%xmm5\n" - "punpcklwd %xmm3,%xmm1\n" - "punpckhwd %xmm3,%xmm5\n" - "movdqa %xmm5,%xmm3\n" - "movdqa %xmm4,%xmm5\n" - "punpcklwd %xmm6,%xmm4\n" - "punpckhwd %xmm6,%xmm5\n" - "movdqa %xmm5,%xmm6\n" - "movdqa (%esp),%xmm5\n" - "movdqa %xmm6,(%esp)\n" - "movdqa %xmm5,%xmm6\n" - "punpcklwd %xmm7,%xmm5\n" - "punpckhwd %xmm7,%xmm6\n" - "movdqa %xmm6,%xmm7\n" - "movdqa %xmm0,%xmm6\n" - "punpckldq %xmm4,%xmm0\n" - "punpckhdq %xmm4,%xmm6\n" - "movdqa %xmm6,%xmm4\n" - "movdqa (%esp),%xmm6\n" - "movlpd %xmm0,(%edx)\n" - "movhpd %xmm0,(%ebx)\n" - "movlpd %xmm4,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm4,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm2,%xmm0\n" - "punpckldq %xmm6,%xmm2\n" - "movlpd %xmm2,(%edx)\n" - "movhpd %xmm2,(%ebx)\n" - "punpckhdq %xmm6,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm1,%xmm0\n" - "punpckldq %xmm5,%xmm1\n" - "movlpd %xmm1,(%edx)\n" - "movhpd %xmm1,(%ebx)\n" - "punpckhdq %xmm5,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "movdqa %xmm3,%xmm0\n" - "punpckldq %xmm7,%xmm3\n" - "movlpd %xmm3,(%edx)\n" - "movhpd %xmm3,(%ebx)\n" - "punpckhdq %xmm7,%xmm0\n" - "movlpd %xmm0,(%edx,%esi,1)\n" - "lea (%edx,%esi,2),%edx\n" - "movhpd %xmm0,(%ebx,%ebp,1)\n" - "lea (%ebx,%ebp,2),%ebx\n" - "sub $0x8,%ecx\n" - "ja 1b\n" - "mov 0x10(%esp),%esp\n" - "pop %ebp\n" - "pop %edi\n" - "pop %esi\n" - "pop %ebx\n" - "ret\n" +"1: \n" + "movdqa (%eax),%xmm0 \n" + "movdqa (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqa (%eax),%xmm2 \n" + "movdqa (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqa (%eax),%xmm4 \n" + "movdqa (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqa (%eax),%xmm6 \n" + "movdqa (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqa (%esp),%xmm5 \n" + "movdqa %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqa (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + 
"movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "sub $0x8,%ecx \n" + "ja 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" + "ret \n" ); #elif defined (__x86_64__) // 64 bit version has enough registers to do 16x8 to 8x16 at a time. #define HAS_TRANSPOSE_WX8_FAST_SSSE3 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - asm volatile( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. -"1:\n" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3),%%xmm1\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm0,%%xmm8\n" - "punpcklbw %%xmm1,%%xmm0\n" - "punpckhbw %%xmm1,%%xmm8\n" - "movdqa (%0),%%xmm2\n" - "movdqa %%xmm0,%%xmm1\n" - "movdqa %%xmm8,%%xmm9\n" - "palignr $0x8,%%xmm1,%%xmm1\n" - "palignr $0x8,%%xmm9,%%xmm9\n" - "movdqa (%0,%3),%%xmm3\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm2,%%xmm10\n" - "punpcklbw %%xmm3,%%xmm2\n" - "punpckhbw %%xmm3,%%xmm10\n" - "movdqa %%xmm2,%%xmm3\n" - "movdqa %%xmm10,%%xmm11\n" - "movdqa (%0),%%xmm4\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "palignr $0x8,%%xmm11,%%xmm11\n" - "movdqa (%0,%3),%%xmm5\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm4,%%xmm12\n" - "punpcklbw %%xmm5,%%xmm4\n" - "punpckhbw %%xmm5,%%xmm12\n" - "movdqa %%xmm4,%%xmm5\n" - "movdqa %%xmm12,%%xmm13\n" - "movdqa (%0),%%xmm6\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "palignr $0x8,%%xmm13,%%xmm13\n" - "movdqa (%0,%3),%%xmm7\n" - "lea (%0,%3,2),%0\n" - "movdqa %%xmm6,%%xmm14\n" - "punpcklbw %%xmm7,%%xmm6\n" - "punpckhbw %%xmm7,%%xmm14\n" - "neg %3\n" - "movdqa %%xmm6,%%xmm7\n" - "movdqa %%xmm14,%%xmm15\n" - "lea 0x10(%0,%3,8),%0\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "palignr $0x8,%%xmm15,%%xmm15\n" - "neg %3\n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqa (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqa (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqa (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqa (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqa (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" // 
Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "palignr $0x8,%%xmm2,%%xmm2\n" - "palignr $0x8,%%xmm3,%%xmm3\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm4,%%xmm6\n" - "movdqa %%xmm5,%%xmm7\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "punpcklwd %%xmm10,%%xmm8\n" - "punpcklwd %%xmm11,%%xmm9\n" - "movdqa %%xmm8,%%xmm10\n" - "movdqa %%xmm9,%%xmm11\n" - "palignr $0x8,%%xmm10,%%xmm10\n" - "palignr $0x8,%%xmm11,%%xmm11\n" - "punpcklwd %%xmm14,%%xmm12\n" - "punpcklwd %%xmm15,%%xmm13\n" - "movdqa %%xmm12,%%xmm14\n" - "movdqa %%xmm13,%%xmm15\n" - "palignr $0x8,%%xmm14,%%xmm14\n" - "palignr $0x8,%%xmm15,%%xmm15\n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqa %%xmm0,%%xmm4\n" - "palignr $0x8,%%xmm4,%%xmm4\n" - "movq %%xmm4,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm6,%%xmm2\n" - "movdqa %%xmm2,%%xmm6\n" - "movq %%xmm2,(%1)\n" - "palignr $0x8,%%xmm6,%%xmm6\n" - "punpckldq %%xmm5,%%xmm1\n" - "movq %%xmm6,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm1,%%xmm5\n" - "movq %%xmm1,(%1)\n" - "palignr $0x8,%%xmm5,%%xmm5\n" - "movq %%xmm5,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm7,%%xmm3\n" - "movq %%xmm3,(%1)\n" - "movdqa %%xmm3,%%xmm7\n" - "palignr $0x8,%%xmm7,%%xmm7\n" - "movq %%xmm7,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm12,%%xmm8\n" - "movq %%xmm8,(%1)\n" - "movdqa %%xmm8,%%xmm12\n" - "palignr $0x8,%%xmm12,%%xmm12\n" - "movq %%xmm12,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm14,%%xmm10\n" - "movdqa %%xmm10,%%xmm14\n" - "movq %%xmm10,(%1)\n" - "palignr $0x8,%%xmm14,%%xmm14\n" - "punpckldq %%xmm13,%%xmm9\n" - "movq %%xmm14,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "movdqa %%xmm9,%%xmm13\n" - "movq %%xmm9,(%1)\n" - "palignr $0x8,%%xmm13,%%xmm13\n" - "movq %%xmm13,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "punpckldq %%xmm15,%%xmm11\n" - "movq %%xmm11,(%1)\n" - "movdqa %%xmm11,%%xmm15\n" - "palignr $0x8,%%xmm15,%%xmm15\n" - "movq %%xmm15,(%1,%4)\n" - "lea (%1,%4,2),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr 
$0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -640,98 +640,98 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int w) { - asm volatile( + asm volatile ( // Read in the data from the source pointer. // First round of bit swap. -"1:\n" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%4),%%xmm1\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm0,%%xmm8\n" - "punpcklbw %%xmm1,%%xmm0\n" - "punpckhbw %%xmm1,%%xmm8\n" - "movdqa %%xmm8,%%xmm1\n" - "movdqa (%0),%%xmm2\n" - "movdqa (%0,%4),%%xmm3\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm2,%%xmm8\n" - "punpcklbw %%xmm3,%%xmm2\n" - "punpckhbw %%xmm3,%%xmm8\n" - "movdqa %%xmm8,%%xmm3\n" - "movdqa (%0),%%xmm4\n" - "movdqa (%0,%4),%%xmm5\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm4,%%xmm8\n" - "punpcklbw %%xmm5,%%xmm4\n" - "punpckhbw %%xmm5,%%xmm8\n" - "movdqa %%xmm8,%%xmm5\n" - "movdqa (%0),%%xmm6\n" - "movdqa (%0,%4),%%xmm7\n" - "lea (%0,%4,2),%0\n" - "movdqa %%xmm6,%%xmm8\n" - "punpcklbw %%xmm7,%%xmm6\n" - "neg %4\n" - "lea 0x10(%0,%4,8),%0\n" - "punpckhbw %%xmm7,%%xmm8\n" - "movdqa %%xmm8,%%xmm7\n" - "neg %4\n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqa (%0),%%xmm4 \n" + "movdqa (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" // Second round of bit swap. 
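// Illustrative sketch (not part of this patch; the name is hypothetical):
// TransposeUVWx8 consumes interleaved UV rows and emits a transposed U plane
// and a transposed V plane ("Write back U channel" / "Write back V channel"
// below). A scalar equivalent for one w x 8 band of UV pairs:
static void TransposeUVWx8_Sketch(const uint8* src, int src_stride,
                                  uint8* dst_a, int dst_stride_a,
                                  uint8* dst_b, int dst_stride_b, int w) {
  for (int i = 0; i < w; ++i) {        // each UV pair (column) in the band
    for (int j = 0; j < 8; ++j) {      // eight source rows
      dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
      dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
    }
  }
}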
- "movdqa %%xmm0,%%xmm8\n" - "movdqa %%xmm1,%%xmm9\n" - "punpckhwd %%xmm2,%%xmm8\n" - "punpckhwd %%xmm3,%%xmm9\n" - "punpcklwd %%xmm2,%%xmm0\n" - "punpcklwd %%xmm3,%%xmm1\n" - "movdqa %%xmm8,%%xmm2\n" - "movdqa %%xmm9,%%xmm3\n" - "movdqa %%xmm4,%%xmm8\n" - "movdqa %%xmm5,%%xmm9\n" - "punpckhwd %%xmm6,%%xmm8\n" - "punpckhwd %%xmm7,%%xmm9\n" - "punpcklwd %%xmm6,%%xmm4\n" - "punpcklwd %%xmm7,%%xmm5\n" - "movdqa %%xmm8,%%xmm6\n" - "movdqa %%xmm9,%%xmm7\n" + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8\n" - "punpckldq %%xmm4,%%xmm0\n" - "movlpd %%xmm0,(%1)\n" // Write back U channel - "movhpd %%xmm0,(%2)\n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm2,%%xmm8\n" - "punpckldq %%xmm6,%%xmm2\n" - "movlpd %%xmm2,(%1)\n" - "movhpd %%xmm2,(%2)\n" - "punpckhdq %%xmm6,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm1,%%xmm8\n" - "punpckldq %%xmm5,%%xmm1\n" - "movlpd %%xmm1,(%1)\n" - "movhpd %%xmm1,(%2)\n" - "punpckhdq %%xmm5,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "movdqa %%xmm3,%%xmm8\n" - "punpckldq %%xmm7,%%xmm3\n" - "movlpd %%xmm3,(%1)\n" - "movhpd %%xmm3,(%2)\n" - "punpckhdq %%xmm7,%%xmm8\n" - "movlpd %%xmm8,(%1,%5)\n" - "lea (%1,%5,2),%1\n" - "movhpd %%xmm8,(%2,%6)\n" - "lea (%2,%6,2),%2\n" - "sub $0x8,%3\n" - "ja 1b\n" + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "sub $0x8,%3 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 @@ -882,17 +882,17 @@ __asm { #define HAS_REVERSE_LINE_SSSE3 static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); - asm volatile( - "movdqa (%3),%%xmm5\n" - "lea -0x10(%0,%2,1),%0\n" -"1:\n" - "movdqa (%0),%%xmm0\n" - "lea -0x10(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - 
"ja 1b\n" + asm volatile ( + "movdqa (%3),%%xmm5 \n" + "lea -0x10(%0,%2,1),%0 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -1091,19 +1091,19 @@ void ReverseLineUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { intptr_t temp_width = static_cast(width); - asm volatile( - "movdqa (%4),%%xmm5\n" - "lea -0x10(%0,%3,2),%0\n" -"1:\n" - "movdqa (%0),%%xmm0\n" - "lea -0x10(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "movlpd %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "movhpd %%xmm0,(%2)\n" - "lea 0x8(%2),%2\n" - "sub $0x8,%3\n" - "ja 1b\n" + asm volatile ( + "movdqa (%4),%%xmm5 \n" + "lea -0x10(%0,%3,2),%0 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "movhpd %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%3 \n" + "ja 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index f717c5847..272d41fbd 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -15,12 +15,12 @@ namespace libyuv { #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED) void ReverseLine_NEON(const uint8* src, uint8* dst, int width) { - asm volatile( + asm volatile ( // compute where to start writing destination - "add %1, %2\n" + "add %1, %2 \n" // work on segments that are multiples of 16 - "lsrs r3, %2, #4\n" + "lsrs r3, %2, #4 \n" // the output is written in two block. 8 bytes followed // by another 8. reading is done sequentially, from left to @@ -28,72 +28,72 @@ void ReverseLine_NEON(const uint8* src, uint8* dst, int width) { // %1, the destination pointer is incremented after writing // the first of the two blocks. need to subtract that 8 off // along with 16 to get the next location. - "mov r3, #-24\n" + "mov r3, #-24 \n" - "beq 2f\n" + "beq 2f \n" // back of destination by the size of the register that is // going to be reversed - "sub %1, #16\n" + "sub %1, #16 \n" // the loop needs to run on blocks of 16. what will be left // over is either a negative number, the residuals that need // to be done, or 0. if this isn't subtracted off here the // loop will run one extra time. - "sub %2, #16\n" + "sub %2, #16 \n" - "1:\n" - "vld1.8 {q0}, [%0]!\n" // src += 16 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // src += 16 // reverse the bytes in the 64 bit segments. unable to reverse // the bytes in the entire 128 bits in one go. - "vrev64.8 q0, q0\n" + "vrev64.8 q0, q0 \n" // because of the inability to reverse the entire 128 bits // reverse the writing out of the two 64 bit segments. - "vst1.8 {d1}, [%1]!\n" - "vst1.8 {d0}, [%1], r3\n" // dst -= 16 + "vst1.8 {d1}, [%1]! \n" + "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 - "subs %2, #16\n" - "bge 1b\n" + "subs %2, #16 \n" + "bge 1b \n" // add 16 back to the counter. if the result is 0 there is no // residuals so jump past - "adds %2, #16\n" - "beq 5f\n" + "adds %2, #16 \n" + "beq 5f \n" - "add %1, #16\n" + "add %1, #16 \n" - "2:\n" + "2: \n" - "mov r3, #-3\n" + "mov r3, #-3 \n" - "sub %1, #2\n" - "subs %2, #2\n" + "sub %1, #2 \n" + "subs %2, #2 \n" // check for 16*n+1 scenarios where segments_of_2 should not // be run, but there is something left over. 
- "blt 4f\n" + "blt 4f \n" // do this in neon registers as per // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ - "3:\n" - "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - "vst1.8 {d1[0]}, [%1]!\n" - "vst1.8 {d0[0]}, [%1], r3\n" // dst -= 2 + "vst1.8 {d1[0]}, [%1]! \n" + "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 - "subs %2, #2\n" - "bge 3b\n" + "subs %2, #2 \n" + "bge 3b \n" - "adds %2, #2\n" - "beq 5f\n" + "adds %2, #2 \n" + "beq 5f \n" - "4:\n" - "add %1, #1\n" - "vld1.8 {d0[0]}, [%0]\n" - "vst1.8 {d0[0]}, [%1]\n" + "4: \n" + "add %1, #1 \n" + "vld1.8 {d0[0]}, [%0] \n" + "vst1.8 {d0[0]}, [%1] \n" - "5:\n" + "5: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -108,154 +108,154 @@ static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) = void TransposeWx8_NEON(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width) { - asm volatile( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %4, #8\n" + "sub %4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - "1:\n" - "mov r9, %0\n" + "1: \n" + "mov r9, %0 \n" - "vld1.8 {d0}, [r9], %1\n" - "vld1.8 {d1}, [r9], %1\n" - "vld1.8 {d2}, [r9], %1\n" - "vld1.8 {d3}, [r9], %1\n" - "vld1.8 {d4}, [r9], %1\n" - "vld1.8 {d5}, [r9], %1\n" - "vld1.8 {d6}, [r9], %1\n" - "vld1.8 {d7}, [r9]\n" + "vld1.8 {d0}, [r9], %1 \n" + "vld1.8 {d1}, [r9], %1 \n" + "vld1.8 {d2}, [r9], %1 \n" + "vld1.8 {d3}, [r9], %1 \n" + "vld1.8 {d4}, [r9], %1 \n" + "vld1.8 {d5}, [r9], %1 \n" + "vld1.8 {d6}, [r9], %1 \n" + "vld1.8 {d7}, [r9] \n" - "vtrn.8 d1, d0\n" - "vtrn.8 d3, d2\n" - "vtrn.8 d5, d4\n" - "vtrn.8 d7, d6\n" + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" - "vtrn.16 d1, d3\n" - "vtrn.16 d0, d2\n" - "vtrn.16 d5, d7\n" - "vtrn.16 d4, d6\n" + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" - "vtrn.32 d1, d5\n" - "vtrn.32 d0, d4\n" - "vtrn.32 d3, d7\n" - "vtrn.32 d2, d6\n" + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" - "vrev16.8 q0, q0\n" - "vrev16.8 q1, q1\n" - "vrev16.8 q2, q2\n" - "vrev16.8 q3, q3\n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vst1.8 {d1}, [r9], %3\n" - "vst1.8 {d0}, [r9], %3\n" - "vst1.8 {d3}, [r9], %3\n" - "vst1.8 {d2}, [r9], %3\n" - "vst1.8 {d5}, [r9], %3\n" - "vst1.8 {d4}, [r9], %3\n" - "vst1.8 {d7}, [r9], %3\n" - "vst1.8 {d6}, [r9]\n" + "vst1.8 {d1}, [r9], %3 \n" + "vst1.8 {d0}, [r9], %3 \n" + "vst1.8 {d3}, [r9], %3 \n" + "vst1.8 {d2}, [r9], %3 \n" + "vst1.8 {d5}, [r9], %3 \n" + "vst1.8 {d4}, [r9], %3 \n" + "vst1.8 {d7}, [r9], %3 \n" + "vst1.8 {d6}, [r9] \n" - "add %0, #8\n" // src += 8 - "add %2, %2, %3, lsl #3\n" // dst += 8 * dst_stride - "subs %4, #8\n" // w -= 8 - "bge 1b\n" + "add %0, #8 \n" // src += 8 + "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride + "subs %4, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. 
- "adds %4, #8\n" - "beq 4f\n" + "adds %4, #8 \n" + "beq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %4, #2\n" - "blt 3f\n" + "cmp %4, #2 \n" + "blt 3f \n" - "cmp %4, #4\n" - "blt 2f\n" + "cmp %4, #4 \n" + "blt 2f \n" // 4x8 block - "mov r9, %0\n" - "vld1.32 {d0[0]}, [r9], %1\n" - "vld1.32 {d0[1]}, [r9], %1\n" - "vld1.32 {d1[0]}, [r9], %1\n" - "vld1.32 {d1[1]}, [r9], %1\n" - "vld1.32 {d2[0]}, [r9], %1\n" - "vld1.32 {d2[1]}, [r9], %1\n" - "vld1.32 {d3[0]}, [r9], %1\n" - "vld1.32 {d3[1]}, [r9]\n" + "mov r9, %0 \n" + "vld1.32 {d0[0]}, [r9], %1 \n" + "vld1.32 {d0[1]}, [r9], %1 \n" + "vld1.32 {d1[0]}, [r9], %1 \n" + "vld1.32 {d1[1]}, [r9], %1 \n" + "vld1.32 {d2[0]}, [r9], %1 \n" + "vld1.32 {d2[1]}, [r9], %1 \n" + "vld1.32 {d3[0]}, [r9], %1 \n" + "vld1.32 {d3[1]}, [r9] \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vld1.8 {q3}, [%5]\n" + "vld1.8 {q3}, [%5] \n" - "vtbl.8 d4, {d0, d1}, d6\n" - "vtbl.8 d5, {d0, d1}, d7\n" - "vtbl.8 d0, {d2, d3}, d6\n" - "vtbl.8 d1, {d2, d3}, d7\n" + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" // TODO: rework shuffle above to write // out with 4 instead of 8 writes - "vst1.32 {d4[0]}, [r9], %3\n" - "vst1.32 {d4[1]}, [r9], %3\n" - "vst1.32 {d5[0]}, [r9], %3\n" - "vst1.32 {d5[1]}, [r9]\n" + "vst1.32 {d4[0]}, [r9], %3 \n" + "vst1.32 {d4[1]}, [r9], %3 \n" + "vst1.32 {d5[0]}, [r9], %3 \n" + "vst1.32 {d5[1]}, [r9] \n" - "add r9, %2, #4\n" - "vst1.32 {d0[0]}, [r9], %3\n" - "vst1.32 {d0[1]}, [r9], %3\n" - "vst1.32 {d1[0]}, [r9], %3\n" - "vst1.32 {d1[1]}, [r9]\n" + "add r9, %2, #4 \n" + "vst1.32 {d0[0]}, [r9], %3 \n" + "vst1.32 {d0[1]}, [r9], %3 \n" + "vst1.32 {d1[0]}, [r9], %3 \n" + "vst1.32 {d1[1]}, [r9] \n" - "add %0, #4\n" // src += 4 - "add %2, %2, %3, lsl #2\n" // dst += 4 * dst_stride - "subs %4, #4\n" // w -= 4 - "beq 4f\n" + "add %0, #4 \n" // src += 4 + "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride + "subs %4, #4 \n" // w -= 4 + "beq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less - "cmp %4, #2\n" - "blt 3f\n" + "cmp %4, #2 \n" + "blt 3f \n" // 2x8 block - "2:\n" - "mov r9, %0\n" - "vld1.16 {d0[0]}, [r9], %1\n" - "vld1.16 {d1[0]}, [r9], %1\n" - "vld1.16 {d0[1]}, [r9], %1\n" - "vld1.16 {d1[1]}, [r9], %1\n" - "vld1.16 {d0[2]}, [r9], %1\n" - "vld1.16 {d1[2]}, [r9], %1\n" - "vld1.16 {d0[3]}, [r9], %1\n" - "vld1.16 {d1[3]}, [r9]\n" + "2: \n" + "mov r9, %0 \n" + "vld1.16 {d0[0]}, [r9], %1 \n" + "vld1.16 {d1[0]}, [r9], %1 \n" + "vld1.16 {d0[1]}, [r9], %1 \n" + "vld1.16 {d1[1]}, [r9], %1 \n" + "vld1.16 {d0[2]}, [r9], %1 \n" + "vld1.16 {d1[2]}, [r9], %1 \n" + "vld1.16 {d0[3]}, [r9], %1 \n" + "vld1.16 {d1[3]}, [r9] \n" - "vtrn.8 d0, d1\n" + "vtrn.8 d0, d1 \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vst1.64 {d0}, [r9], %3\n" - "vst1.64 {d1}, [r9]\n" + "vst1.64 {d0}, [r9], %3 \n" + "vst1.64 {d1}, [r9] \n" - "add %0, #2\n" // src += 2 - "add %2, %2, %3, lsl #1\n" // dst += 2 * dst_stride - "subs %4, #2\n" // w -= 2 - "beq 4f\n" + "add %0, #2 \n" // src += 2 + "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride + "subs %4, #2 \n" // w -= 2 + "beq 4f \n" // 1x8 block - "3:\n" - "vld1.8 {d0[0]}, [%0], %1\n" - "vld1.8 {d0[1]}, [%0], %1\n" - "vld1.8 {d0[2]}, [%0], %1\n" - "vld1.8 {d0[3]}, [%0], %1\n" - "vld1.8 {d0[4]}, [%0], %1\n" - "vld1.8 {d0[5]}, [%0], %1\n" - "vld1.8 {d0[6]}, [%0], %1\n" - "vld1.8 {d0[7]}, [%0]\n" + "3: \n" + "vld1.8 {d0[0]}, [%0], %1 \n" + "vld1.8 {d0[1]}, [%0], %1 \n" + "vld1.8 {d0[2]}, [%0], %1 \n" + "vld1.8 {d0[3]}, [%0], 
%1 \n" + "vld1.8 {d0[4]}, [%0], %1 \n" + "vld1.8 {d0[5]}, [%0], %1 \n" + "vld1.8 {d0[6]}, [%0], %1 \n" + "vld1.8 {d0[7]}, [%0] \n" - "vst1.64 {d0}, [%2]\n" + "vst1.64 {d0}, [%2] \n" - "4:\n" + "4: \n" : "+r"(src), // %0 "+r"(src_stride), // %1 @@ -270,68 +270,68 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, void ReverseLineUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { - asm volatile( + asm volatile ( // compute where to start writing destination - "add %1, %3\n" // dst_a + width - "add %2, %3\n" // dst_b + width + "add %1, %3 \n" // dst_a + width + "add %2, %3 \n" // dst_b + width // work on input segments that are multiples of 16, but // width that has been passed is output segments, half // the size of input. - "lsrs r12, %3, #3\n" + "lsrs r12, %3, #3 \n" - "beq 2f\n" + "beq 2f \n" // the output is written in to two blocks. - "mov r12, #-8\n" + "mov r12, #-8 \n" // back of destination by the size of the register that is // going to be reversed - "sub %1, #8\n" - "sub %2, #8\n" + "sub %1, #8 \n" + "sub %2, #8 \n" // the loop needs to run on blocks of 8. what will be left // over is either a negative number, the residuals that need // to be done, or 0. if this isn't subtracted off here the // loop will run one extra time. - "sub %3, #8\n" + "sub %3, #8 \n" - "1:\n" - "vld2.8 {d0, d1}, [%0]!\n" // src += 16 + "1: \n" + "vld2.8 {d0, d1}, [%0]! \n" // src += 16 // reverse the bytes in the 64 bit segments - "vrev64.8 q0, q0\n" + "vrev64.8 q0, q0 \n" - "vst1.8 {d0}, [%1], r12\n" // dst_a -= 8 - "vst1.8 {d1}, [%2], r12\n" // dst_b -= 8 + "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 - "subs %3, #8\n" - "bge 1b\n" + "subs %3, #8 \n" + "bge 1b \n" // add 8 back to the counter. if the result is 0 there is no // residuals so return - "adds %3, #8\n" - "beq 4f\n" + "adds %3, #8 \n" + "beq 4f \n" - "add %1, #8\n" - "add %2, #8\n" + "add %1, #8 \n" + "add %2, #8 \n" - "2:\n" + "2: \n" - "mov r12, #-1\n" + "mov r12, #-1 \n" - "sub %1, #1\n" - "sub %2, #1\n" + "sub %1, #1 \n" + "sub %2, #1 \n" - "3:\n" - "vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2 + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - "vst1.8 {d0[0]}, [%1], r12\n" // dst_a -= 1 - "vst1.8 {d1[0]}, [%2], r12\n" // dst_b -= 1 + "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 - "subs %3, %3, #1\n" - "bgt 3b\n" - "4:\n" + "subs %3, %3, #1 \n" + "bgt 3b \n" + "4: \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 @@ -348,198 +348,198 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) { - asm volatile( + asm volatile ( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this - "sub %6, #8\n" + "sub %6, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane - "1:\n" - "mov r9, %0\n" + "1: \n" + "mov r9, %0 \n" - "vld2.8 {d0, d1}, [r9], %1\n" - "vld2.8 {d2, d3}, [r9], %1\n" - "vld2.8 {d4, d5}, [r9], %1\n" - "vld2.8 {d6, d7}, [r9], %1\n" - "vld2.8 {d16, d17}, [r9], %1\n" - "vld2.8 {d18, d19}, [r9], %1\n" - "vld2.8 {d20, d21}, [r9], %1\n" - "vld2.8 {d22, d23}, [r9]\n" + "vld2.8 {d0, d1}, [r9], %1 \n" + "vld2.8 {d2, d3}, [r9], %1 \n" + "vld2.8 {d4, d5}, [r9], %1 \n" + "vld2.8 {d6, d7}, [r9], %1 \n" + "vld2.8 {d16, d17}, [r9], %1 \n" + "vld2.8 {d18, d19}, [r9], %1 \n" + "vld2.8 {d20, d21}, [r9], %1 \n" + "vld2.8 {d22, d23}, [r9] \n" - "vtrn.8 q1, q0\n" - "vtrn.8 q3, q2\n" - "vtrn.8 q9, q8\n" - "vtrn.8 q11, q10\n" + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" - "vtrn.16 q1, q3\n" - "vtrn.16 q0, q2\n" - "vtrn.16 q9, q11\n" - "vtrn.16 q8, q10\n" + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" - "vtrn.32 q1, q9\n" - "vtrn.32 q0, q8\n" - "vtrn.32 q3, q11\n" - "vtrn.32 q2, q10\n" + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" - "vrev16.8 q0, q0\n" - "vrev16.8 q1, q1\n" - "vrev16.8 q2, q2\n" - "vrev16.8 q3, q3\n" - "vrev16.8 q8, q8\n" - "vrev16.8 q9, q9\n" - "vrev16.8 q10, q10\n" - "vrev16.8 q11, q11\n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vst1.8 {d2}, [r9], %3\n" - "vst1.8 {d0}, [r9], %3\n" - "vst1.8 {d6}, [r9], %3\n" - "vst1.8 {d4}, [r9], %3\n" - "vst1.8 {d18}, [r9], %3\n" - "vst1.8 {d16}, [r9], %3\n" - "vst1.8 {d22}, [r9], %3\n" - "vst1.8 {d20}, [r9]\n" + "vst1.8 {d2}, [r9], %3 \n" + "vst1.8 {d0}, [r9], %3 \n" + "vst1.8 {d6}, [r9], %3 \n" + "vst1.8 {d4}, [r9], %3 \n" + "vst1.8 {d18}, [r9], %3 \n" + "vst1.8 {d16}, [r9], %3 \n" + "vst1.8 {d22}, [r9], %3 \n" + "vst1.8 {d20}, [r9] \n" - "mov r9, %4\n" + "mov r9, %4 \n" - "vst1.8 {d3}, [r9], %5\n" - "vst1.8 {d1}, [r9], %5\n" - "vst1.8 {d7}, [r9], %5\n" - "vst1.8 {d5}, [r9], %5\n" - "vst1.8 {d19}, [r9], %5\n" - "vst1.8 {d17}, [r9], %5\n" - "vst1.8 {d23}, [r9], %5\n" - "vst1.8 {d21}, [r9]\n" + "vst1.8 {d3}, [r9], %5 \n" + "vst1.8 {d1}, [r9], %5 \n" + "vst1.8 {d7}, [r9], %5 \n" + "vst1.8 {d5}, [r9], %5 \n" + "vst1.8 {d19}, [r9], %5 \n" + "vst1.8 {d17}, [r9], %5 \n" + "vst1.8 {d23}, [r9], %5 \n" + "vst1.8 {d21}, [r9] \n" - "add %0, #8*2\n" // src += 8*2 - "add %2, %2, %3, lsl #3\n" // dst_a += 8 * dst_stride_a - "add %4, %4, %5, lsl #3\n" // dst_b += 8 * dst_stride_b - "subs %6, #8\n" // w -= 8 - "bge 1b\n" + "add %0, #8*2 \n" // src += 8*2 + "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a + "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b + "subs %6, #8 \n" // w -= 8 + "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. 
- "adds %6, #8\n" - "beq 4f\n" + "adds %6, #8 \n" + "beq 4f \n" // some residual, so between 1 and 7 lines left to transpose - "cmp %6, #2\n" - "blt 3f\n" + "cmp %6, #2 \n" + "blt 3f \n" - "cmp %6, #4\n" - "blt 2f\n" + "cmp %6, #4 \n" + "blt 2f \n" //TODO(frkoenig) : clean this up // 4x8 block - "mov r9, %0\n" - "vld1.64 {d0}, [r9], %1\n" - "vld1.64 {d1}, [r9], %1\n" - "vld1.64 {d2}, [r9], %1\n" - "vld1.64 {d3}, [r9], %1\n" - "vld1.64 {d4}, [r9], %1\n" - "vld1.64 {d5}, [r9], %1\n" - "vld1.64 {d6}, [r9], %1\n" - "vld1.64 {d7}, [r9]\n" + "mov r9, %0 \n" + "vld1.64 {d0}, [r9], %1 \n" + "vld1.64 {d1}, [r9], %1 \n" + "vld1.64 {d2}, [r9], %1 \n" + "vld1.64 {d3}, [r9], %1 \n" + "vld1.64 {d4}, [r9], %1 \n" + "vld1.64 {d5}, [r9], %1 \n" + "vld1.64 {d6}, [r9], %1 \n" + "vld1.64 {d7}, [r9] \n" - "vld1.8 {q15}, [%7]\n" + "vld1.8 {q15}, [%7] \n" - "vtrn.8 q0, q1\n" - "vtrn.8 q2, q3\n" + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" - "vtbl.8 d16, {d0, d1}, d30\n" - "vtbl.8 d17, {d0, d1}, d31\n" - "vtbl.8 d18, {d2, d3}, d30\n" - "vtbl.8 d19, {d2, d3}, d31\n" - "vtbl.8 d20, {d4, d5}, d30\n" - "vtbl.8 d21, {d4, d5}, d31\n" - "vtbl.8 d22, {d6, d7}, d30\n" - "vtbl.8 d23, {d6, d7}, d31\n" + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vst1.32 {d16[0]}, [r9], %3\n" - "vst1.32 {d16[1]}, [r9], %3\n" - "vst1.32 {d17[0]}, [r9], %3\n" - "vst1.32 {d17[1]}, [r9], %3\n" + "vst1.32 {d16[0]}, [r9], %3 \n" + "vst1.32 {d16[1]}, [r9], %3 \n" + "vst1.32 {d17[0]}, [r9], %3 \n" + "vst1.32 {d17[1]}, [r9], %3 \n" - "add r9, %2, #4\n" - "vst1.32 {d20[0]}, [r9], %3\n" - "vst1.32 {d20[1]}, [r9], %3\n" - "vst1.32 {d21[0]}, [r9], %3\n" - "vst1.32 {d21[1]}, [r9]\n" + "add r9, %2, #4 \n" + "vst1.32 {d20[0]}, [r9], %3 \n" + "vst1.32 {d20[1]}, [r9], %3 \n" + "vst1.32 {d21[0]}, [r9], %3 \n" + "vst1.32 {d21[1]}, [r9] \n" - "mov r9, %4\n" + "mov r9, %4 \n" - "vst1.32 {d18[0]}, [r9], %5\n" - "vst1.32 {d18[1]}, [r9], %5\n" - "vst1.32 {d19[0]}, [r9], %5\n" - "vst1.32 {d19[1]}, [r9], %5\n" + "vst1.32 {d18[0]}, [r9], %5 \n" + "vst1.32 {d18[1]}, [r9], %5 \n" + "vst1.32 {d19[0]}, [r9], %5 \n" + "vst1.32 {d19[1]}, [r9], %5 \n" - "add r9, %4, #4\n" - "vst1.32 {d22[0]}, [r9], %5\n" - "vst1.32 {d22[1]}, [r9], %5\n" - "vst1.32 {d23[0]}, [r9], %5\n" - "vst1.32 {d23[1]}, [r9]\n" + "add r9, %4, #4 \n" + "vst1.32 {d22[0]}, [r9], %5 \n" + "vst1.32 {d22[1]}, [r9], %5 \n" + "vst1.32 {d23[0]}, [r9], %5 \n" + "vst1.32 {d23[1]}, [r9] \n" - "add %0, #4*2\n" // src += 4 * 2 - "add %2, %2, %3, lsl #2\n" // dst_a += 4 * dst_stride_a - "add %4, %4, %5, lsl #2\n" // dst_b += 4 * dst_stride_b - "subs %6, #4\n" // w -= 4 - "beq 4f\n" + "add %0, #4*2 \n" // src += 4 * 2 + "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a + "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b + "subs %6, #4 \n" // w -= 4 + "beq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less - "cmp %6, #2\n" - "blt 3f\n" + "cmp %6, #2 \n" + "blt 3f \n" // 2x8 block - "2:\n" - "mov r9, %0\n" - "vld2.16 {d0[0], d2[0]}, [r9], %1\n" - "vld2.16 {d1[0], d3[0]}, [r9], %1\n" - "vld2.16 {d0[1], d2[1]}, [r9], %1\n" - "vld2.16 {d1[1], d3[1]}, [r9], %1\n" - "vld2.16 {d0[2], d2[2]}, [r9], %1\n" - "vld2.16 {d1[2], d3[2]}, [r9], %1\n" - "vld2.16 {d0[3], d2[3]}, [r9], %1\n" - "vld2.16 {d1[3], d3[3]}, [r9]\n" + "2: \n" + "mov r9, %0 \n" 
+ "vld2.16 {d0[0], d2[0]}, [r9], %1 \n" + "vld2.16 {d1[0], d3[0]}, [r9], %1 \n" + "vld2.16 {d0[1], d2[1]}, [r9], %1 \n" + "vld2.16 {d1[1], d3[1]}, [r9], %1 \n" + "vld2.16 {d0[2], d2[2]}, [r9], %1 \n" + "vld2.16 {d1[2], d3[2]}, [r9], %1 \n" + "vld2.16 {d0[3], d2[3]}, [r9], %1 \n" + "vld2.16 {d1[3], d3[3]}, [r9] \n" - "vtrn.8 d0, d1\n" - "vtrn.8 d2, d3\n" + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" - "mov r9, %2\n" + "mov r9, %2 \n" - "vst1.64 {d0}, [r9], %3\n" - "vst1.64 {d2}, [r9]\n" + "vst1.64 {d0}, [r9], %3 \n" + "vst1.64 {d2}, [r9] \n" - "mov r9, %4\n" + "mov r9, %4 \n" - "vst1.64 {d1}, [r9], %5\n" - "vst1.64 {d3}, [r9]\n" + "vst1.64 {d1}, [r9], %5 \n" + "vst1.64 {d3}, [r9] \n" - "add %0, #2*2\n" // src += 2 * 2 - "add %2, %2, %3, lsl #1\n" // dst_a += 2 * dst_stride_a - "add %4, %4, %5, lsl #1\n" // dst_b += 2 * dst_stride_b - "subs %6, #2\n" // w -= 2 - "beq 4f\n" + "add %0, #2*2 \n" // src += 2 * 2 + "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a + "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b + "subs %6, #2 \n" // w -= 2 + "beq 4f \n" // 1x8 block - "3:\n" - "vld2.8 {d0[0], d1[0]}, [%0], %1\n" - "vld2.8 {d0[1], d1[1]}, [%0], %1\n" - "vld2.8 {d0[2], d1[2]}, [%0], %1\n" - "vld2.8 {d0[3], d1[3]}, [%0], %1\n" - "vld2.8 {d0[4], d1[4]}, [%0], %1\n" - "vld2.8 {d0[5], d1[5]}, [%0], %1\n" - "vld2.8 {d0[6], d1[6]}, [%0], %1\n" - "vld2.8 {d0[7], d1[7]}, [%0]\n" + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0], %1 \n" + "vld2.8 {d0[1], d1[1]}, [%0], %1 \n" + "vld2.8 {d0[2], d1[2]}, [%0], %1 \n" + "vld2.8 {d0[3], d1[3]}, [%0], %1 \n" + "vld2.8 {d0[4], d1[4]}, [%0], %1 \n" + "vld2.8 {d0[5], d1[5]}, [%0], %1 \n" + "vld2.8 {d0[6], d1[6]}, [%0], %1 \n" + "vld2.8 {d0[7], d1[7]}, [%0] \n" - "vst1.64 {d0}, [%2]\n" - "vst1.64 {d1}, [%4]\n" + "vst1.64 {d0}, [%2] \n" + "vst1.64 {d1}, [%4] \n" - "4:\n" + "4: \n" : "+r"(src), // %0 "+r"(src_stride), // %1 diff --git a/source/row_posix.cc b/source/row_posix.cc index f355122f6..30ce811fe 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -59,23 +59,23 @@ static const uvec8 kShuffleMaskBGRAToARGB = { }; void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "pslld $0x18,%%xmm5\n" -"1:" - "movq (%0),%%xmm0\n" - "lea 0x8(%0),%0\n" - "punpcklbw %%xmm0,%%xmm0\n" - "movdqa %%xmm0,%%xmm1\n" - "punpcklwd %%xmm0,%%xmm0\n" - "punpckhwd %%xmm1,%%xmm1\n" - "por %%xmm5,%%xmm0\n" - "por %%xmm5,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "movdqa %%xmm1,0x10(%1)\n" - "lea 0x20(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" +"1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -88,16 +88,16 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { - asm volatile( - "movdqa %3,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + asm volatile ( + "movdqa %3,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 
0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -111,16 +111,16 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { } void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { - asm volatile( - "movdqa %3,%%xmm5\n" -"1:" - "movdqa (%0),%%xmm0\n" - "lea 0x10(%0),%0\n" - "pshufb %%xmm5,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + asm volatile ( + "movdqa %3,%%xmm5 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -133,34 +133,34 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { } void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5\n" - "movdqa %3,%%xmm4\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm4,%%xmm2\n" - "por %%xmm5,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm4,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm5,%%xmm0\n" - "pshufb %%xmm4,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm5,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm4,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" - "por %%xmm5,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_bg24), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -173,34 +173,34 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { } void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5\n" - "movdqa %3,%%xmm4\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm3\n" - "lea 0x30(%0),%0\n" - "movdqa %%xmm3,%%xmm2\n" - "palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] } - "pshufb %%xmm4,%%xmm2\n" - "por %%xmm5,%%xmm2\n" - "palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] } - "pshufb %%xmm4,%%xmm0\n" - "movdqa %%xmm2,0x20(%1)\n" - "por %%xmm5,%%xmm0\n" - "pshufb %%xmm4,%%xmm1\n" - "movdqa %%xmm0,(%1)\n" - "por %%xmm5,%%xmm1\n" - "palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] } - "pshufb %%xmm4,%%xmm3\n" - "movdqa %%xmm1,0x10(%1)\n" 
- "por %%xmm5,%%xmm3\n" - "movdqa %%xmm3,0x30(%1)\n" - "lea 0x40(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] } + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] } + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqa %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] } + "pshufb %%xmm4,%%xmm3 \n" + "movdqa %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqa %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(pix) // %2 @@ -213,29 +213,29 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { } void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - asm volatile( - "movdqa %4,%%xmm5\n" - "movdqa %3,%%xmm4\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm2\n" - "movdqa 0x30(%0),%%xmm3\n" - "pmaddubsw %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm4,%%xmm1\n" - "pmaddubsw %%xmm4,%%xmm2\n" - "pmaddubsw %%xmm4,%%xmm3\n" - "lea 0x40(%0),%0\n" - "phaddw %%xmm1,%%xmm0\n" - "phaddw %%xmm3,%%xmm2\n" - "psrlw $0x7,%%xmm0\n" - "psrlw $0x7,%%xmm2\n" - "packuswb %%xmm2,%%xmm0\n" - "paddb %%xmm5,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 @@ -253,10 +253,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #ifdef HAS_ARGBTOUVROW_SSSE3 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { - asm volatile( - "movdqa %0,%%xmm4\n" - "movdqa %1,%%xmm3\n" - "movdqa %2,%%xmm5\n" + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" : : "m"(kARGBToU), // %0 "m"(kARGBToV), // %1 @@ -266,43 +266,43 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, "xmm3", "xmm4", "xmm5" #endif ); - asm volatile( - "sub %1,%2\n" -"1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa 0x20(%0),%%xmm2\n" - "movdqa 0x30(%0),%%xmm6\n" - "pavgb (%0,%4,1),%%xmm0\n" - "pavgb 0x10(%0,%4,1),%%xmm1\n" - "pavgb 0x20(%0,%4,1),%%xmm2\n" - "pavgb 0x30(%0,%4,1),%%xmm6\n" - "lea 0x40(%0),%0\n" - "movdqa %%xmm0,%%xmm7\n" - "shufps $0x88,%%xmm1,%%xmm0\n" - "shufps $0xdd,%%xmm1,%%xmm7\n" - "pavgb %%xmm7,%%xmm0\n" - "movdqa %%xmm2,%%xmm7\n" - "shufps $0x88,%%xmm6,%%xmm2\n" - "shufps $0xdd,%%xmm6,%%xmm7\n" - "pavgb %%xmm7,%%xmm2\n" - "movdqa %%xmm0,%%xmm1\n" - "movdqa %%xmm2,%%xmm6\n" - 
"pmaddubsw %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm4,%%xmm2\n" - "pmaddubsw %%xmm3,%%xmm1\n" - "pmaddubsw %%xmm3,%%xmm6\n" - "phaddw %%xmm2,%%xmm0\n" - "phaddw %%xmm6,%%xmm1\n" - "psraw $0x8,%%xmm0\n" - "psraw $0x8,%%xmm1\n" - "packsswb %%xmm1,%%xmm0\n" - "paddb %%xmm5,%%xmm0\n" - "movlps %%xmm0,(%1)\n" - "movhps %%xmm0,(%1,%2,1)\n" - "lea 0x8(%1),%1\n" - "sub $0x10,%3\n" - "ja 1b\n" + asm volatile ( + "sub %1,%2 \n" +"1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" : "+r"(src_argb0), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 @@ -332,98 +332,65 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif -#if defined(__APPLE__) -// REG6 version uses 1 less register but is slower -#define REG6 -#endif - -#ifdef REG6 -// 6 register version only has REG_a for temporary -#define CLOBBER "%"REG_a -#define YUVTORGB \ - "1:" \ - "movzb (%1),%%"REG_a"\n" \ - "lea 1(%1),%1\n" \ - "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ - "movzb (%2),%%"REG_a"\n" \ - "lea 1(%2),%2\n" \ - "movq 4096(%5,%%"REG_a",8),%%xmm1\n" \ - "paddsw %%xmm1,%%xmm0\n" \ - "movzb (%0),%%"REG_a"\n" \ - "movq 0(%5,%%"REG_a",8),%%xmm2\n" \ - "movzb 0x1(%0),%%"REG_a"\n" \ - "movq 0(%5,%%"REG_a",8),%%xmm3\n" \ - "lea 2(%0),%0\n" \ - "paddsw %%xmm0,%%xmm2\n" \ - "paddsw %%xmm0,%%xmm3\n" \ - "shufps $0x44,%%xmm3,%%xmm2\n" \ - "psraw $0x6,%%xmm2\n" \ - "packuswb %%xmm2,%%xmm2\n" \ - "movq %%xmm2,0x0(%3)\n" \ - "lea 8(%3),%3\n" \ - "sub $0x2,%4\n" \ - "ja 1b\n" -#else #define CLOBBER "%"REG_a, "%"REG_d // This version produces 2 pixels #define YUVTORGB \ -"1:" \ - "movzb (%1),%%"REG_a"\n" \ - "lea 1(%1),%1\n" \ - "movzb (%2),%%"REG_d"\n" \ - "lea 1(%2),%2\n" \ - "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ - "movzb 0(%0),%%"REG_a"\n" \ - "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ - "paddsw %%xmm1,%%xmm0\n" \ - "movzb 1(%0),%%"REG_d"\n" \ - "punpcklqdq %%xmm0,%%xmm0\n" \ - "lea 2(%0),%0\n" \ - "movq 0(%5,%%"REG_a",8),%%xmm1\n" \ - "movhps 0(%5,%%"REG_d",8),%%xmm1\n" \ - "paddsw %%xmm0,%%xmm1\n" \ - "psraw $6,%%xmm1\n" \ - "packuswb %%xmm1,%%xmm1\n" \ - "movq %%xmm1,0(%3)\n" \ - "lea 8(%3),%3\n" \ - "sub $0x2,%4\n" \ - "ja 1b\n" +"1: \n" \ + "movzb (%1),%%"REG_a" \n" \ + "lea 1(%1),%1 \n" \ + "movzb (%2),%%"REG_d" \n" \ + "lea 1(%2),%2 \n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \ + "movzb 0(%0),%%"REG_a" \n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \ + "paddsw %%xmm1,%%xmm0 \n" \ + "movzb 1(%0),%%"REG_d" \n" \ + "punpcklqdq %%xmm0,%%xmm0 \n" \ + "lea 2(%0),%0 \n" \ + "movq 0(%5,%%"REG_a",8),%%xmm1 \n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \ + "paddsw 
%%xmm0,%%xmm1 \n" \ + "psraw $6,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "movq %%xmm1,0(%3) \n" \ + "lea 8(%3),%3 \n" \ + "sub $0x2,%4 \n" \ + "ja 1b \n" // This version produces 4 pixels #define YUVTORGB4 \ -"1:" \ - "movzb 0(%1),%%"REG_a"\n" \ - "movzb 0(%2),%%"REG_d"\n" \ - "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ - "movzb 0(%0),%%"REG_a"\n" \ - "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ - "paddsw %%xmm1,%%xmm0\n" \ - "movzb 1(%0),%%"REG_d"\n" \ - "punpcklqdq %%xmm0,%%xmm0\n" \ - "movq 0(%5,%%"REG_a",8),%%xmm2\n" \ - "movhps 0(%5,%%"REG_d",8),%%xmm2\n" \ - "paddsw %%xmm0,%%xmm2\n" \ - "psraw $6,%%xmm2\n" \ - "movzb 1(%1),%%"REG_a"\n" \ - "movzb 1(%2),%%"REG_d"\n" \ - "movq 2048(%5,%%"REG_a",8),%%xmm0\n" \ - "movzb 2(%0),%%"REG_a"\n" \ - "movq 4096(%5,%%"REG_d",8),%%xmm1\n" \ - "paddsw %%xmm1,%%xmm0\n" \ - "movzb 3(%0),%%"REG_d"\n" \ - "punpcklqdq %%xmm0,%%xmm0\n" \ - "movq 0(%5,%%"REG_a",8),%%xmm3\n" \ - "movhps 0(%5,%%"REG_d",8),%%xmm3\n" \ - "paddsw %%xmm0,%%xmm3\n" \ - "psraw $6,%%xmm3\n" \ - "lea 2(%1),%1\n" \ - "lea 2(%2),%2\n" \ - "lea 4(%0),%0\n" \ - "packuswb %%xmm3,%%xmm2\n" \ - "movdqa %%xmm2,0(%3)\n" \ - "lea 16(%3),%3\n" \ - "sub $0x4,%4\n" \ - "ja 1b\n" -#endif +"1: \n" \ + "movzb 0(%1),%%"REG_a" \n" \ + "movzb 0(%2),%%"REG_d" \n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \ + "movzb 0(%0),%%"REG_a" \n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \ + "paddsw %%xmm1,%%xmm0 \n" \ + "movzb 1(%0),%%"REG_d" \n" \ + "punpcklqdq %%xmm0,%%xmm0 \n" \ + "movq 0(%5,%%"REG_a",8),%%xmm2 \n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \ + "paddsw %%xmm0,%%xmm2 \n" \ + "psraw $6,%%xmm2 \n" \ + "movzb 1(%1),%%"REG_a" \n" \ + "movzb 1(%2),%%"REG_d" \n" \ + "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \ + "movzb 2(%0),%%"REG_a" \n" \ + "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \ + "paddsw %%xmm1,%%xmm0 \n" \ + "movzb 3(%0),%%"REG_d" \n" \ + "punpcklqdq %%xmm0,%%xmm0 \n" \ + "movq 0(%5,%%"REG_a",8),%%xmm3 \n" \ + "movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \ + "paddsw %%xmm0,%%xmm3 \n" \ + "psraw $6,%%xmm3 \n" \ + "lea 2(%1),%1 \n" \ + "lea 2(%2),%2 \n" \ + "lea 4(%0),%0 \n" \ + "packuswb %%xmm3,%%xmm2 \n" \ + "movdqa %%xmm2,0(%3) \n" \ + "lea 16(%3),%3 \n" \ + "sub $0x4,%4 \n" \ + "ja 1b \n" \ // 6 or 7 registers void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi @@ -431,7 +398,7 @@ void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( + asm volatile ( YUVTORGB : "+r"(y_buf), // %0 "+r"(u_buf), // %1 @@ -452,7 +419,7 @@ void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( + asm volatile ( YUVTORGB4 : "+r"(y_buf), // %0 "+r"(u_buf), // %1 @@ -472,7 +439,7 @@ void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( + asm volatile ( YUVTORGB : "+r"(y_buf), // %0 "+r"(u_buf), // %1 @@ -492,7 +459,7 @@ void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( + asm volatile ( YUVTORGB : "+r"(y_buf), // %0 "+r"(u_buf), // %1 @@ -513,26 +480,26 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi const uint8* v_buf, // rdx uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( -"1:" - "movzb (%1),%%"REG_a"\n" - "lea 1(%1),%1\n" - "movq 2048(%5,%%"REG_a",8),%%xmm0\n" - "movzb (%2),%%"REG_a"\n" - 
"lea 1(%2),%2\n" - "movq 4096(%5,%%"REG_a",8),%%xmm1\n" - "paddsw %%xmm1,%%xmm0\n" - "movzb (%0),%%"REG_a"\n" - "lea 1(%0),%0\n" - "movq 0(%5,%%"REG_a",8),%%xmm2\n" - "paddsw %%xmm0,%%xmm2\n" - "shufps $0x44,%%xmm2,%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movd %%xmm2,0x0(%3)\n" - "lea 4(%3),%3\n" - "sub $0x1,%4\n" - "ja 1b\n" + asm volatile ( +"1: \n" + "movzb (%1),%%"REG_a" \n" + "lea 1(%1),%1 \n" + "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" + "movzb (%2),%%"REG_a" \n" + "lea 1(%2),%2 \n" + "movq 4096(%5,%%"REG_a",8),%%xmm1 \n" + "paddsw %%xmm1,%%xmm0 \n" + "movzb (%0),%%"REG_a" \n" + "lea 1(%0),%0 \n" + "movq 0(%5,%%"REG_a",8),%%xmm2 \n" + "paddsw %%xmm0,%%xmm2 \n" + "shufps $0x44,%%xmm2,%%xmm2 \n" + "psraw $0x6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,0x0(%3) \n" + "lea 4(%3),%3 \n" + "sub $0x1,%4 \n" + "ja 1b \n" : "+r"(y_buf), // %0 "+r"(u_buf), // %1 "+r"(v_buf), // %2 @@ -550,19 +517,19 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi uint8* rgb_buf, // rcx int width) { // r8 - asm volatile( -"1:" - "movzb (%0),%%"REG_a"\n" - "movzb 0x1(%0),%%"REG_d"\n" - "movq (%3,%%"REG_a",8),%%xmm2\n" - "lea 2(%0),%0\n" - "movhps (%3,%%"REG_d",8),%%xmm2\n" - "psraw $0x6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movq %%xmm2,0x0(%1)\n" - "lea 8(%1),%1\n" - "sub $0x2,%2\n" - "ja 1b\n" + asm volatile ( +"1: \n" + "movzb (%0),%%"REG_a" \n" + "movzb 0x1(%0),%%"REG_d" \n" + "movq (%3,%%"REG_a",8),%%xmm2 \n" + "lea 2(%0),%0 \n" + "movhps (%3,%%"REG_d",8),%%xmm2 \n" + "psraw $0x6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movq %%xmm2,0x0(%1) \n" + "lea 8(%1),%1 \n" + "sub $0x2,%2 \n" + "ja 1b \n" : "+r"(y_buf), // %0 "+r"(rgb_buf), // %1 "+rm"(width) // %2 @@ -591,44 +558,44 @@ void FastConvertYUVToARGBRow_MMX(const uint8* y_buf, uint8* rgb_buf, int width); asm( - ".text\n" + ".text \n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToARGBRow_MMX\n" -"_FastConvertYUVToARGBRow_MMX:\n" + ".globl _FastConvertYUVToARGBRow_MMX \n" +"_FastConvertYUVToARGBRow_MMX: \n" #else - ".global FastConvertYUVToARGBRow_MMX\n" -"FastConvertYUVToARGBRow_MMX:\n" + ".global FastConvertYUVToARGBRow_MMX \n" +"FastConvertYUVToARGBRow_MMX: \n" #endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" + "pusha \n" + "mov 0x24(%esp),%edx \n" + "mov 0x28(%esp),%edi \n" + "mov 0x2c(%esp),%esi \n" + "mov 0x30(%esp),%ebp \n" + "mov 0x34(%esp),%ecx \n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" +"1: \n" + "movzbl (%edi),%eax \n" + "lea 1(%edi),%edi \n" + "movzbl (%esi),%ebx \n" + "lea 1(%esi),%esi \n" "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" + "movzbl (%edx),%eax \n" "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" + "movzbl 0x1(%edx),%ebx \n" "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" + "lea 2(%edx),%edx \n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "paddsw %mm0,%mm1 \n" + "paddsw %mm0,%mm2 \n" + "psraw $0x6,%mm1 \n" + "psraw $0x6,%mm2 \n" + "packuswb %mm2,%mm1 \n" + "movq %mm1,0x0(%ebp) \n" + "lea 8(%ebp),%ebp \n" 
+ "sub $0x2,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); void FastConvertYUVToBGRARow_MMX(const uint8* y_buf, @@ -637,44 +604,44 @@ void FastConvertYUVToBGRARow_MMX(const uint8* y_buf, uint8* rgb_buf, int width); asm( - ".text\n" + ".text \n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToBGRARow_MMX\n" -"_FastConvertYUVToBGRARow_MMX:\n" + ".globl _FastConvertYUVToBGRARow_MMX \n" +"_FastConvertYUVToBGRARow_MMX: \n" #else - ".global FastConvertYUVToBGRARow_MMX\n" -"FastConvertYUVToBGRARow_MMX:\n" + ".global FastConvertYUVToBGRARow_MMX \n" +"FastConvertYUVToBGRARow_MMX: \n" #endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" + "pusha \n" + "mov 0x24(%esp),%edx \n" + "mov 0x28(%esp),%edi \n" + "mov 0x2c(%esp),%esi \n" + "mov 0x30(%esp),%ebp \n" + "mov 0x34(%esp),%ecx \n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" +"1: \n" + "movzbl (%edi),%eax \n" + "lea 1(%edi),%edi \n" + "movzbl (%esi),%ebx \n" + "lea 1(%esi),%esi \n" "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" + "movzbl (%edx),%eax \n" "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" + "movzbl 0x1(%edx),%ebx \n" "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" + "lea 2(%edx),%edx \n" "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "paddsw %mm0,%mm1 \n" + "paddsw %mm0,%mm2 \n" + "psraw $0x6,%mm1 \n" + "psraw $0x6,%mm2 \n" + "packuswb %mm2,%mm1 \n" + "movq %mm1,0x0(%ebp) \n" + "lea 8(%ebp),%ebp \n" + "sub $0x2,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); void FastConvertYUVToABGRRow_MMX(const uint8* y_buf, @@ -683,44 +650,44 @@ void FastConvertYUVToABGRRow_MMX(const uint8* y_buf, uint8* rgb_buf, int width); asm( - ".text\n" + ".text \n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYUVToABGRRow_MMX\n" -"_FastConvertYUVToABGRRow_MMX:\n" + ".globl _FastConvertYUVToABGRRow_MMX \n" +"_FastConvertYUVToABGRRow_MMX: \n" #else - ".global FastConvertYUVToABGRRow_MMX\n" -"FastConvertYUVToABGRRow_MMX:\n" + ".global FastConvertYUVToABGRRow_MMX \n" +"FastConvertYUVToABGRRow_MMX: \n" #endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" + "pusha \n" + "mov 0x24(%esp),%edx \n" + "mov 0x28(%esp),%edi \n" + "mov 0x2c(%esp),%esi \n" + "mov 0x30(%esp),%ebp \n" + "mov 0x34(%esp),%ecx \n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" +"1: \n" + "movzbl (%edi),%eax \n" + "lea 1(%edi),%edi \n" + "movzbl (%esi),%ebx \n" + "lea 1(%esi),%esi \n" "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" + "movzbl (%edx),%eax \n" "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n" - "movzbl 0x1(%edx),%ebx\n" + "movzbl 0x1(%edx),%ebx \n" "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n" - "lea 2(%edx),%edx\n" + "lea 2(%edx),%edx \n" "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n" - "paddsw %mm0,%mm1\n" - "paddsw %mm0,%mm2\n" - "psraw $0x6,%mm1\n" - "psraw $0x6,%mm2\n" - "packuswb %mm2,%mm1\n" - "movq %mm1,0x0(%ebp)\n" - "lea 8(%ebp),%ebp\n" - "sub 
$0x2,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "paddsw %mm0,%mm1 \n" + "paddsw %mm0,%mm2 \n" + "psraw $0x6,%mm1 \n" + "psraw $0x6,%mm2 \n" + "packuswb %mm2,%mm1 \n" + "movq %mm1,0x0(%ebp) \n" + "lea 8(%ebp),%ebp \n" + "sub $0x2,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf, @@ -729,73 +696,73 @@ void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf, uint8* rgb_buf, int width); asm( - ".text\n" + ".text \n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYUV444ToARGBRow_MMX\n" -"_FastConvertYUV444ToARGBRow_MMX:\n" + ".globl _FastConvertYUV444ToARGBRow_MMX \n" +"_FastConvertYUV444ToARGBRow_MMX: \n" #else - ".global FastConvertYUV444ToARGBRow_MMX\n" -"FastConvertYUV444ToARGBRow_MMX:\n" + ".global FastConvertYUV444ToARGBRow_MMX \n" +"FastConvertYUV444ToARGBRow_MMX: \n" #endif - "pusha\n" - "mov 0x24(%esp),%edx\n" - "mov 0x28(%esp),%edi\n" - "mov 0x2c(%esp),%esi\n" - "mov 0x30(%esp),%ebp\n" - "mov 0x34(%esp),%ecx\n" + "pusha \n" + "mov 0x24(%esp),%edx \n" + "mov 0x28(%esp),%edi \n" + "mov 0x2c(%esp),%esi \n" + "mov 0x30(%esp),%ebp \n" + "mov 0x34(%esp),%ecx \n" -"1:" - "movzbl (%edi),%eax\n" - "lea 1(%edi),%edi\n" - "movzbl (%esi),%ebx\n" - "lea 1(%esi),%esi\n" +"1: \n" + "movzbl (%edi),%eax \n" + "lea 1(%edi),%edi \n" + "movzbl (%esi),%ebx \n" + "lea 1(%esi),%esi \n" "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n" - "movzbl (%edx),%eax\n" + "movzbl (%edx),%eax \n" "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" - "lea 1(%edx),%edx\n" + "lea 1(%edx),%edx \n" "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n" - "psraw $0x6,%mm0\n" - "packuswb %mm0,%mm0\n" - "movd %mm0,0x0(%ebp)\n" - "lea 4(%ebp),%ebp\n" - "sub $0x1,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "psraw $0x6,%mm0 \n" + "packuswb %mm0,%mm0 \n" + "movd %mm0,0x0(%ebp) \n" + "lea 4(%ebp),%ebp \n" + "sub $0x1,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); void FastConvertYToARGBRow_MMX(const uint8* y_buf, uint8* rgb_buf, int width); asm( - ".text\n" + ".text \n" #if defined(OSX) || defined(IOS) - ".globl _FastConvertYToARGBRow_MMX\n" -"_FastConvertYToARGBRow_MMX:\n" + ".globl _FastConvertYToARGBRow_MMX \n" +"_FastConvertYToARGBRow_MMX: \n" #else - ".global FastConvertYToARGBRow_MMX\n" -"FastConvertYToARGBRow_MMX:\n" + ".global FastConvertYToARGBRow_MMX \n" +"FastConvertYToARGBRow_MMX: \n" #endif - "push %ebx\n" - "mov 0x8(%esp),%eax\n" - "mov 0xc(%esp),%edx\n" - "mov 0x10(%esp),%ecx\n" + "push %ebx \n" + "mov 0x8(%esp),%eax \n" + "mov 0xc(%esp),%edx \n" + "mov 0x10(%esp),%ecx \n" -"1:" - "movzbl (%eax),%ebx\n" +"1: \n" + "movzbl (%eax),%ebx \n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n" - "psraw $0x6,%mm0\n" - "movzbl 0x1(%eax),%ebx\n" + "psraw $0x6,%mm0 \n" + "movzbl 0x1(%eax),%ebx \n" "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n" - "psraw $0x6,%mm1\n" - "packuswb %mm1,%mm0\n" - "lea 0x2(%eax),%eax\n" - "movq %mm0,(%edx)\n" - "lea 0x8(%edx),%edx\n" - "sub $0x2,%ecx\n" - "ja 1b\n" - "pop %ebx\n" - "ret\n" + "psraw $0x6,%mm1 \n" + "packuswb %mm1,%mm0 \n" + "lea 0x2(%eax),%eax \n" + "movq %mm0,(%edx) \n" + "lea 0x8(%edx),%edx \n" + "sub $0x2,%ecx \n" + "ja 1b \n" + "pop %ebx \n" + "ret \n" ); #endif diff --git a/source/row_win.cc b/source/row_win.cc index 912ad866c..a0b1cc594 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -92,7 +92,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 - wloop: + convertloop: movq xmm0, qword ptr 
[eax] lea eax, [eax + 8] punpcklbw xmm0, xmm0 @@ -105,7 +105,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { movdqa [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 - ja wloop + ja convertloop ret } } @@ -753,18 +753,18 @@ SIMD_ALIGNED(const int16 kUVBiasR[8]) = { __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ - __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ - __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ - __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ - __asm psubw xmm1, kUVBiasG \ - __asm psubw xmm2, kUVBiasR \ + __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ /* Step 2: Find Y contribution to 8 R,G,B values */ \ __asm movq xmm3, qword ptr [eax] \ __asm lea eax, [eax + 8] \ __asm punpcklbw xmm3, xmm4 \ - __asm psubsw xmm3, kYSub16 \ - __asm pmullw xmm3, kYToRgb \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ __asm paddw xmm0, xmm3 /* B += Y */ \ __asm paddw xmm1, xmm3 /* G += Y */ \ __asm paddw xmm2, xmm3 /* R += Y */ \ diff --git a/source/scale.cc b/source/scale.cc index e2b3e085d..25b8de47e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -51,13 +51,12 @@ void SetUseReferenceImpl(bool use) { #define HAS_SCALEROWDOWN2_NEON void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, uint8* dst, int dst_width) { - __asm__ volatile - ( - "1:\n" - "vld2.u8 {q0,q1}, [%0]!\n" // load even pixels into q0, odd into q1 - "vst1.u8 {q0}, [%1]!\n" // store even pixels - "subs %2, %2, #16\n" // 16 processed per loop - "bhi 1b\n" + asm volatile ( + "1: \n" + "vld2.u8 {q0,q1}, [%0]! \n" // load even pixels into q0, odd into q1 + "vst1.u8 {q0}, [%1]! \n" // store even pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 @@ -68,21 +67,20 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */, void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst, int dst_width) { - __asm__ volatile - ( - "add %1, %0\n" // change the stride to row 2 pointer - "1:\n" - "vld1.u8 {q0,q1}, [%0]!\n" // load row 1 and post increment - "vld1.u8 {q2,q3}, [%1]!\n" // load row 2 and post increment - "vpaddl.u8 q0, q0\n" // row 1 add adjacent - "vpaddl.u8 q1, q1\n" - "vpadal.u8 q0, q2\n" // row 2 add adjacent, add row 1 to row 2 - "vpadal.u8 q1, q3\n" - "vrshrn.u16 d0, q0, #2\n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2\n" - "vst1.u8 {q0}, [%2]!\n" - "subs %3, %3, #16\n" // 16 processed per loop - "bhi 1b\n" + asm volatile ( + "add %1, %0 \n" // change the stride to row 2 pointer + "1: \n" + "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post increment + "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post increment + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent, add row 1 to row 2 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.u8 {q0}, [%2]! 
\n" + "subs %3, %3, #16 \n" // 16 processed per loop + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 @@ -95,16 +93,15 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN4_NEON static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "1:\n" - "vld2.u8 {d0, d1}, [%0]!\n" - "vtrn.u8 d1, d0\n" - "vshrn.u16 d0, q0, #8\n" - "vst1.u32 {d0[1]}, [%1]!\n" + asm volatile ( + "1: \n" + "vld2.u8 {d0, d1}, [%0]! \n" + "vtrn.u8 d1, d0 \n" + "vshrn.u16 d0, q0, #8 \n" + "vst1.u32 {d0[1]}, [%1]! \n" - "subs %2, #4\n" - "bhi 1b\n" + "subs %2, #4 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -115,31 +112,30 @@ static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */, static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "add r4, %0, %3\n" - "add r5, r4, %3\n" - "add %3, r5, %3\n" - "1:\n" - "vld1.u8 {q0}, [%0]!\n" // load up 16x4 block of input data - "vld1.u8 {q1}, [r4]!\n" - "vld1.u8 {q2}, [r5]!\n" - "vld1.u8 {q3}, [%3]!\n" + asm volatile ( + "add r4, %0, %3 \n" + "add r5, r4, %3 \n" + "add %3, r5, %3 \n" + "1: \n" + "vld1.u8 {q0}, [%0]! \n" // load up 16x4 block of input data + "vld1.u8 {q1}, [r4]! \n" + "vld1.u8 {q2}, [r5]! \n" + "vld1.u8 {q3}, [%3]! \n" - "vpaddl.u8 q0, q0\n" - "vpadal.u8 q0, q1\n" - "vpadal.u8 q0, q2\n" - "vpadal.u8 q0, q3\n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0\n" + "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4\n" // divide by 16 w/rounding + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0\n" - "vst1.u32 {d0[0]}, [%1]!\n" + "vmovn.u16 d0, q0 \n" + "vst1.u32 {d0[0]}, [%1]! \n" - "subs %2, #4\n" - "bhi 1b\n" + "subs %2, #4 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -155,14 +151,13 @@ static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride, // Point samples 32 pixels to 24 pixels. static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "1:\n" - "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0 - "vmov d2, d3\n" // order needs to be d0, d1, d2 - "vst3.u8 {d0, d1, d2}, [%1]!\n" - "subs %2, #24\n" - "bhi 1b\n" + asm volatile ( + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vmov d2, d3 \n" // order needs to be d0, d1, d2 + "vst3.u8 {d0, d1, d2}, [%1]! \n" + "subs %2, #24 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -173,51 +168,50 @@ static void ScaleRowDown34_NEON(const uint8* src_ptr, int /* src_stride */, static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "vmov.u8 d24, #3\n" - "add %3, %0\n" - "1:\n" - "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]!\n" // src line 1 + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together - "vmovl.u8 q8, d4\n" - "vmovl.u8 q9, d5\n" - "vmovl.u8 q10, d6\n" - "vmovl.u8 q11, d7\n" + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24\n" - "vmlal.u8 q9, d1, d24\n" - "vmlal.u8 q10, d2, d24\n" - "vmlal.u8 q11, d3, d24\n" + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2\n" - "vqrshrn.u16 d1, q9, #2\n" - "vqrshrn.u16 d2, q10, #2\n" - "vqrshrn.u16 d3, q11, #2\n" + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1\n" - "vmlal.u8 q8, d0, d24\n" - "vqrshrn.u16 d0, q8, #2\n" + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2\n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2\n" - "vmlal.u8 q8, d3, d24\n" - "vqrshrn.u16 d2, q8, #2\n" + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" - "vst3.u8 {d0, d1, d2}, [%1]!\n" + "vst3.u8 {d0, d1, d2}, [%1]! \n" - "subs %2, #24\n" - "bhi 1b\n" + "subs %2, #24 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -229,35 +223,34 @@ static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "vmov.u8 d24, #3\n" - "add %3, %0\n" - "1:\n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + asm volatile ( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2\n" - "vrhadd.u8 q1, q1, q3\n" + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1\n" - "vmlal.u8 q3, d0, d24\n" - "vqrshrn.u16 d0, q3, #2\n" + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2\n" + "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2\n" - "vmlal.u8 q3, d3, d24\n" - "vqrshrn.u16 d2, q3, #2\n" + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" - "vst3.u8 {d0, d1, d2}, [%1]!\n" + "vst3.u8 {d0, d1, d2}, [%1]! \n" - "subs %2, #24\n" - "bhi 1b\n" + "subs %2, #24 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -282,17 +275,16 @@ const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) = // 32 -> 12 static void ScaleRowDown38_NEON(const uint8* src_ptr, int, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "vld1.u8 {q3}, [%3]\n" - "1:\n" - "vld1.u8 {d0, d1, d2, d3}, [%0]!\n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6\n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7\n" - "vst1.u8 {d4}, [%1]!\n" - "vst1.u32 {d5[0]}, [%1]!\n" - "subs %2, #12\n" - "bhi 1b\n" + asm volatile ( + "vld1.u8 {q3}, [%3] \n" + "1: \n" + "vld1.u8 {d0, d1, d2, d3}, [%0]! 
\n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.u8 {d4}, [%1]! \n" + "vst1.u32 {d5[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -304,59 +296,58 @@ static void ScaleRowDown38_NEON(const uint8* src_ptr, int, // 32x3 -> 12x1 static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "vld1.u16 {q13}, [%4]\n" - "vld1.u8 {q14}, [%5]\n" - "vld1.u8 {q15}, [%6]\n" - "add r4, %0, %3, lsl #1\n" - "add %3, %0\n" - "1:\n" + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "vld1.u8 {q15}, [%6] \n" + "add r4, %0, %3, lsl #1 \n" + "add %3, %0 \n" + "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" - "vld4.u8 {d4, d5, d6, d7}, [%3]!\n" - "vld4.u8 {d16, d17, d18, d19}, [r4]!\n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1\n" - "vtrn.u8 d4, d5\n" - "vtrn.u8 d16, d17\n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3\n" - "vtrn.u8 d6, d7\n" - "vtrn.u8 d18, d19\n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0\n" - "vpaddl.u8 q2, q2\n" - "vpaddl.u8 q8, q8\n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3\n" - "vpaddl.u8 d7, d7\n" - "vpaddl.u8 d19, d19\n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" // combine source lines - "vadd.u16 q0, q2\n" - "vadd.u16 q0, q8\n" - "vadd.u16 d4, d3, d7\n" - "vadd.u16 d4, d19\n" + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q13\n" - "vmovn.u16 d4, q2\n" + "vqrdmulh.s16 q2, q13 \n" + "vmovn.u16 d4, q2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -364,41 +355,41 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2\n" - "vmovl.u8 q3, d6\n" - "vmovl.u8 q9, d18\n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" // combine source lines - "vadd.u16 q1, q3\n" - "vadd.u16 q1, q9\n" + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3\n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3\n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1\n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. 
- "vqrdmulh.s16 q0, q15\n" + "vqrdmulh.s16 q0, q15 \n" // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4\n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28\n" - "vtbl.u8 d4, {d0, d1, d2}, d29\n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.u8 {d3}, [%1]!\n" - "vst1.u32 {d4[0]}, [%1]!\n" - "subs %2, #12\n" - "bhi 1b\n" + "vst1.u8 {d3}, [%1]! \n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -414,47 +405,46 @@ static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride, // 32x2 -> 12x1 static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - __asm__ volatile - ( - "vld1.u16 {q13}, [%4]\n" - "vld1.u8 {q14}, [%5]\n" - "add %3, %0\n" - "1:\n" + asm volatile ( + "vld1.u16 {q13}, [%4] \n" + "vld1.u8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]!\n" - "vld4.u8 {d4, d5, d6, d7}, [%3]!\n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1\n" - "vtrn.u8 d4, d5\n" + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3\n" - "vtrn.u8 d6, d7\n" + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0\n" - "vpaddl.u8 q2, q2\n" + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3\n" - "vpaddl.u8 d7, d7\n" + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" // combine source lines - "vadd.u16 q0, q2\n" - "vadd.u16 d4, d3, d7\n" + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2\n" + "vqrshrn.u16 d4, q2, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This @@ -462,39 +452,39 @@ static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride, // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2\n" - "vmovl.u8 q3, d6\n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" // combine source lines - "vadd.u16 q1, q3\n" + "vadd.u16 q1, q3 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3\n" + "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3\n" + "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 - "vadd.u16 q0, q1\n" + "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. - "vqrdmulh.s16 q0, q13\n" + "vqrdmulh.s16 q0, q13 \n" // Align for table lookup, vtbl requires registers to // be adjacent - "vmov.u8 d2, d4\n" + "vmov.u8 d2, d4 \n" - "vtbl.u8 d3, {d0, d1, d2}, d28\n" - "vtbl.u8 d4, {d0, d1, d2}, d29\n" + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.u8 {d3}, [%1]!\n" - "vst1.u32 {d4[0]}, [%1]!\n" - "subs %2, #12\n" - "bhi 1b\n" + "vst1.u8 {d3}, [%1]! 
\n" + "vst1.u32 {d4[0]}, [%1]! \n" + "subs %2, #12 \n" + "bhi 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -1461,20 +1451,20 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #define HAS_SCALEROWDOWN2_SSE2 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1485,30 +1475,30 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlw $0x8,%%xmm5\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movdqa 0x10(%0,%3,1),%%xmm3\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "movdqa %%xmm1,%%xmm3\n" - "psrlw $0x8,%%xmm1\n" - "pand %%xmm5,%%xmm2\n" - "pand %%xmm5,%%xmm3\n" - "pavgw %%xmm2,%%xmm0\n" - "pavgw %%xmm3,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%1)\n" - "lea 0x10(%1),%1\n" - "sub $0x10,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm5,%%xmm2 \n" + "pand %%xmm5,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1520,21 +1510,21 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN4_SSE2 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrld $0x18,%%xmm5\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1546,44 +1536,44 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, static 
void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { intptr_t temp = 0; - asm volatile( - "pcmpeqb %%xmm7,%%xmm7\n" - "psrlw $0x8,%%xmm7\n" - "lea (%4,%4,2),%3\n" + asm volatile ( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "lea (%4,%4,2),%3 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%4,1),%%xmm2\n" - "movdqa 0x10(%0,%4,1),%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa (%0,%4,2),%%xmm2\n" - "movdqa 0x10(%0,%4,2),%%xmm3\n" - "movdqa (%0,%3,1),%%xmm4\n" - "movdqa 0x10(%0,%3,1),%%xmm5\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "movdqa %%xmm1,%%xmm3\n" - "psrlw $0x8,%%xmm1\n" - "pand %%xmm7,%%xmm2\n" - "pand %%xmm7,%%xmm3\n" - "pavgw %%xmm2,%%xmm0\n" - "pavgw %%xmm3,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,%%xmm2\n" - "psrlw $0x8,%%xmm0\n" - "pand %%xmm7,%%xmm2\n" - "pavgw %%xmm2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "lea 0x8(%1),%1\n" - "sub $0x8,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%4,1),%%xmm2 \n" + "movdqa 0x10(%0,%4,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%4,2),%%xmm2 \n" + "movdqa 0x10(%0,%4,2),%%xmm3 \n" + "movdqa (%0,%3,1),%%xmm4 \n" + "movdqa 0x10(%0,%3,1),%%xmm5 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrlw $0x8,%%xmm1 \n" + "pand %%xmm7,%%xmm2 \n" + "pand %%xmm7,%%xmm3 \n" + "pavgw %%xmm2,%%xmm0 \n" + "pavgw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "pavgw %%xmm2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 @@ -1599,22 +1589,22 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN8_SSE2 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "pcmpeqb %%xmm5,%%xmm5\n" - "psrlq $0x38,%%xmm5\n" + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlq $0x38,%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pand %%xmm5,%%xmm0\n" - "pand %%xmm5,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -1627,69 +1617,69 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown8Int_SSE2\n" -"_ScaleRowDown8Int_SSE2:\n" + ".globl _ScaleRowDown8Int_SSE2 \n" +"_ScaleRowDown8Int_SSE2: \n" #else - 
".global ScaleRowDown8Int_SSE2\n" -"ScaleRowDown8Int_SSE2:\n" + ".global ScaleRowDown8Int_SSE2 \n" +"ScaleRowDown8Int_SSE2: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "lea (%ebx,%ebx,2),%edx\n" - "pxor %xmm7,%xmm7\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "lea (%ebx,%ebx,2),%edx \n" + "pxor %xmm7,%xmm7 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm1\n" - "movdqa (%esi,%ebx,1),%xmm2\n" - "movdqa 0x10(%esi,%ebx,1),%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "movdqa (%esi,%ebx,2),%xmm2\n" - "movdqa 0x10(%esi,%ebx,2),%xmm3\n" - "movdqa (%esi,%edx,1),%xmm4\n" - "movdqa 0x10(%esi,%edx,1),%xmm5\n" - "lea (%esi,%ebx,4),%ebp\n" - "lea 0x20(%esi),%esi\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "movdqa 0x0(%ebp),%xmm2\n" - "movdqa 0x10(%ebp),%xmm3\n" - "movdqa 0x0(%ebp,%ebx,1),%xmm4\n" - "movdqa 0x10(%ebp,%ebx,1),%xmm5\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "movdqa 0x0(%ebp,%ebx,2),%xmm4\n" - "movdqa 0x10(%ebp,%ebx,2),%xmm5\n" - "movdqa 0x0(%ebp,%edx,1),%xmm6\n" - "pavgb %xmm6,%xmm4\n" - "movdqa 0x10(%ebp,%edx,1),%xmm6\n" - "pavgb %xmm6,%xmm5\n" - "pavgb %xmm4,%xmm2\n" - "pavgb %xmm5,%xmm3\n" - "pavgb %xmm2,%xmm0\n" - "pavgb %xmm3,%xmm1\n" - "psadbw %xmm7,%xmm0\n" - "psadbw %xmm7,%xmm1\n" - "pshufd $0xd8,%xmm0,%xmm0\n" - "pshufd $0x8d,%xmm1,%xmm1\n" - "por %xmm1,%xmm0\n" - "psrlw $0x3,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movd %xmm0,(%edi)\n" - "lea 0x4(%edi),%edi\n" - "sub $0x4,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "movdqa (%esi,%ebx,1),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,1),%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa (%esi,%ebx,2),%xmm2 \n" + "movdqa 0x10(%esi,%ebx,2),%xmm3 \n" + "movdqa (%esi,%edx,1),%xmm4 \n" + "movdqa 0x10(%esi,%edx,1),%xmm5 \n" + "lea (%esi,%ebx,4),%ebp \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "movdqa 0x0(%ebp),%xmm2 \n" + "movdqa 0x10(%ebp),%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,1),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,1),%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "movdqa 0x0(%ebp,%ebx,2),%xmm4 \n" + "movdqa 0x10(%ebp,%ebx,2),%xmm5 \n" + "movdqa 0x0(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm4 \n" + "movdqa 0x10(%ebp,%edx,1),%xmm6 \n" + "pavgb %xmm6,%xmm5 \n" + "pavgb %xmm4,%xmm2 \n" + "pavgb %xmm5,%xmm3 \n" + "pavgb %xmm2,%xmm0 \n" + "pavgb %xmm3,%xmm1 \n" + "psadbw %xmm7,%xmm0 \n" + "psadbw %xmm7,%xmm1 \n" + "pshufd $0xd8,%xmm0,%xmm0 \n" + "pshufd $0x8d,%xmm1,%xmm1 \n" + "por %xmm1,%xmm0 \n" + "psrlw $0x3,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "lea 0x4(%edi),%edi \n" + "sub $0x4,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); // fpic is used for magiccam plugin @@ -1698,308 +1688,308 @@ extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown34_SSSE3\n" -"_ScaleRowDown34_SSSE3:\n" + ".globl _ScaleRowDown34_SSSE3 \n" +"_ScaleRowDown34_SSSE3: \n" #else - ".global ScaleRowDown34_SSSE3\n" -"ScaleRowDown34_SSSE3:\n" + ".global ScaleRowDown34_SSSE3 \n" 
+"ScaleRowDown34_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf0,%xmm3\n" - "movdqa _shuf1,%xmm4\n" - "movdqa _shuf2,%xmm5\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf0,%xmm3 \n" + "movdqa _shuf1,%xmm4 \n" + "movdqa _shuf2,%xmm5 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm2\n" - "lea 0x20(%esi),%esi\n" - "movdqa %xmm2,%xmm1\n" - "palignr $0x8,%xmm0,%xmm1\n" - "pshufb %xmm3,%xmm0\n" - "pshufb %xmm4,%xmm1\n" - "pshufb %xmm5,%xmm2\n" - "movq %xmm0,(%edi)\n" - "movq %xmm1,0x8(%edi)\n" - "movq %xmm2,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm2 \n" + "lea 0x20(%esi),%esi \n" + "movdqa %xmm2,%xmm1 \n" + "palignr $0x8,%xmm0,%xmm1 \n" + "pshufb %xmm3,%xmm0 \n" + "pshufb %xmm4,%xmm1 \n" + "pshufb %xmm5,%xmm2 \n" + "movq %xmm0,(%edi) \n" + "movq %xmm1,0x8(%edi) \n" + "movq %xmm2,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown34_1_Int_SSSE3\n" -"_ScaleRowDown34_1_Int_SSSE3:\n" + ".globl _ScaleRowDown34_1_Int_SSSE3 \n" +"_ScaleRowDown34_1_Int_SSSE3: \n" #else - ".global ScaleRowDown34_1_Int_SSSE3\n" -"ScaleRowDown34_1_Int_SSSE3:\n" + ".global ScaleRowDown34_1_Int_SSSE3 \n" +"ScaleRowDown34_1_Int_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebp\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf01,%xmm2\n" - "movdqa _shuf11,%xmm3\n" - "movdqa _shuf21,%xmm4\n" - "movdqa _madd01,%xmm5\n" - "movdqa _madd11,%xmm6\n" - "movdqa _round34,%xmm7\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%ebp),%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm2,%xmm0\n" - "pmaddubsw %xmm5,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movdqu 0x8(%esi),%xmm0\n" - "movdqu 0x8(%esi,%ebp),%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm3,%xmm0\n" - "pmaddubsw %xmm6,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x8(%edi)\n" - "movdqa 0x10(%esi),%xmm0\n" - "movdqa 0x10(%esi,%ebp),%xmm1\n" - "lea 0x20(%esi),%esi\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa _madd21,%xmm1\n" - "pmaddubsw %xmm1,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp),%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + 
"movdqa 0x10(%esi,%ebp),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" - "popa\n" - "ret\n" + "popa \n" + "ret \n" ); extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown34_0_Int_SSSE3\n" -"_ScaleRowDown34_0_Int_SSSE3:\n" + ".globl _ScaleRowDown34_0_Int_SSSE3 \n" +"_ScaleRowDown34_0_Int_SSSE3: \n" #else - ".global ScaleRowDown34_0_Int_SSSE3\n" -"ScaleRowDown34_0_Int_SSSE3:\n" + ".global ScaleRowDown34_0_Int_SSSE3 \n" +"ScaleRowDown34_0_Int_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%ebp\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf01,%xmm2\n" - "movdqa _shuf11,%xmm3\n" - "movdqa _shuf21,%xmm4\n" - "movdqa _madd01,%xmm5\n" - "movdqa _madd11,%xmm6\n" - "movdqa _round34,%xmm7\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%ebp \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf01,%xmm2 \n" + "movdqa _shuf11,%xmm3 \n" + "movdqa _shuf21,%xmm4 \n" + "movdqa _madd01,%xmm5 \n" + "movdqa _madd11,%xmm6 \n" + "movdqa _round34,%xmm7 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%ebp,1),%xmm1\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm2,%xmm0\n" - "pmaddubsw %xmm5,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movdqu 0x8(%esi),%xmm0\n" - "movdqu 0x8(%esi,%ebp,1),%xmm1\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm3,%xmm0\n" - "pmaddubsw %xmm6,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x8(%edi)\n" - "movdqa 0x10(%esi),%xmm0\n" - "movdqa 0x10(%esi,%ebp,1),%xmm1\n" - "lea 0x20(%esi),%esi\n" - "pavgb %xmm0,%xmm1\n" - "pavgb %xmm1,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa _madd21,%xmm1\n" - "pmaddubsw %xmm1,%xmm0\n" - "paddsw %xmm7,%xmm0\n" - "psrlw $0x2,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movq %xmm0,0x10(%edi)\n" - "lea 0x18(%edi),%edi\n" - "sub $0x18,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm2,%xmm0 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movdqu 0x8(%esi),%xmm0 \n" + "movdqu 0x8(%esi,%ebp,1),%xmm1 \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm3,%xmm0 \n" + "pmaddubsw %xmm6,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x8(%edi) \n" + "movdqa 0x10(%esi),%xmm0 \n" + "movdqa 0x10(%esi,%ebp,1),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pavgb %xmm0,%xmm1 \n" + "pavgb %xmm1,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa _madd21,%xmm1 \n" + "pmaddubsw %xmm1,%xmm0 \n" + "paddsw %xmm7,%xmm0 \n" + "psrlw $0x2,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movq %xmm0,0x10(%edi) \n" + "lea 0x18(%edi),%edi \n" + "sub $0x18,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); #define HAS_SCALEROWDOWN38_SSSE3 extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl 
_ScaleRowDown38_SSSE3\n" -"_ScaleRowDown38_SSSE3:\n" + ".globl _ScaleRowDown38_SSSE3 \n" +"_ScaleRowDown38_SSSE3: \n" #else - ".global ScaleRowDown38_SSSE3\n" -"ScaleRowDown38_SSSE3:\n" + ".global ScaleRowDown38_SSSE3 \n" +"ScaleRowDown38_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shuf38a ,%xmm4\n" - "movdqa _shuf38b ,%xmm5\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shuf38a ,%xmm4 \n" + "movdqa _shuf38b ,%xmm5 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa 0x10(%esi),%xmm1\n" - "lea 0x20(%esi),%esi\n" - "pshufb %xmm4,%xmm0\n" - "pshufb %xmm5,%xmm1\n" - "paddusb %xmm1,%xmm0\n" - "movq %xmm0,(%edi)\n" - "movhlps %xmm0,%xmm1\n" - "movd %xmm1,0x8(%edi)\n" - "lea 0xc(%edi),%edi\n" - "sub $0xc,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa 0x10(%esi),%xmm1 \n" + "lea 0x20(%esi),%esi \n" + "pshufb %xmm4,%xmm0 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusb %xmm1,%xmm0 \n" + "movq %xmm0,(%edi) \n" + "movhlps %xmm0,%xmm1 \n" + "movd %xmm1,0x8(%edi) \n" + "lea 0xc(%edi),%edi \n" + "sub $0xc,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown38_3_Int_SSSE3\n" -"_ScaleRowDown38_3_Int_SSSE3:\n" + ".globl _ScaleRowDown38_3_Int_SSSE3 \n" +"_ScaleRowDown38_3_Int_SSSE3: \n" #else - ".global ScaleRowDown38_3_Int_SSSE3\n" -"ScaleRowDown38_3_Int_SSSE3:\n" + ".global ScaleRowDown38_3_Int_SSSE3 \n" +"ScaleRowDown38_3_Int_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shufac0,%xmm4\n" - "movdqa _shufac3,%xmm5\n" - "movdqa _scaleac3,%xmm6\n" - "pxor %xmm7,%xmm7\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufac0,%xmm4 \n" + "movdqa _shufac3,%xmm5 \n" + "movdqa _scaleac3,%xmm6 \n" + "pxor %xmm7,%xmm7 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "movhlps %xmm0,%xmm1\n" - "movhlps %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm0\n" - "punpcklbw %xmm7,%xmm1\n" - "punpcklbw %xmm7,%xmm2\n" - "punpcklbw %xmm7,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "movdqa (%esi,%edx,2),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movhlps %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm2\n" - "punpcklbw %xmm7,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "movdqa %xmm0,%xmm2\n" - "psrldq $0x2,%xmm0\n" - "paddusw %xmm0,%xmm2\n" - "psrldq $0x2,%xmm0\n" - "paddusw %xmm0,%xmm2\n" - "pshufb %xmm4,%xmm2\n" - "movdqa %xmm1,%xmm3\n" - "psrldq $0x2,%xmm1\n" - "paddusw %xmm1,%xmm3\n" - "psrldq $0x2,%xmm1\n" - "paddusw %xmm1,%xmm3\n" - "pshufb %xmm5,%xmm3\n" - "paddusw %xmm3,%xmm2\n" - "pmulhuw %xmm6,%xmm2\n" - "packuswb %xmm2,%xmm2\n" - "movd %xmm2,(%edi)\n" - "pextrw $0x2,%xmm2,%eax\n" - "mov %ax,0x4(%edi)\n" - "lea 0x6(%edi),%edi\n" - "sub $0x6,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "movhlps %xmm0,%xmm1 \n" + "movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm1 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa (%esi,%edx,2),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + 
"movhlps %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpcklbw %xmm7,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "movdqa %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "psrldq $0x2,%xmm0 \n" + "paddusw %xmm0,%xmm2 \n" + "pshufb %xmm4,%xmm2 \n" + "movdqa %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "psrldq $0x2,%xmm1 \n" + "paddusw %xmm1,%xmm3 \n" + "pshufb %xmm5,%xmm3 \n" + "paddusw %xmm3,%xmm2 \n" + "pmulhuw %xmm6,%xmm2 \n" + "packuswb %xmm2,%xmm2 \n" + "movd %xmm2,(%edi) \n" + "pextrw $0x2,%xmm2,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleRowDown38_2_Int_SSSE3\n" -"_ScaleRowDown38_2_Int_SSSE3:\n" + ".globl _ScaleRowDown38_2_Int_SSSE3 \n" +"_ScaleRowDown38_2_Int_SSSE3: \n" #else - ".global ScaleRowDown38_2_Int_SSSE3\n" -"ScaleRowDown38_2_Int_SSSE3:\n" + ".global ScaleRowDown38_2_Int_SSSE3 \n" +"ScaleRowDown38_2_Int_SSSE3: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "movdqa _shufab0,%xmm4\n" - "movdqa _shufab1,%xmm5\n" - "movdqa _shufab2,%xmm6\n" - "movdqa _scaleab2,%xmm7\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "movdqa _shufab0,%xmm4 \n" + "movdqa _shufab1,%xmm5 \n" + "movdqa _shufab2,%xmm6 \n" + "movdqa _scaleab2,%xmm7 \n" "1:" - "movdqa (%esi),%xmm2\n" - "pavgb (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm2,%xmm0\n" - "pshufb %xmm4,%xmm0\n" - "movdqa %xmm2,%xmm1\n" - "pshufb %xmm5,%xmm1\n" - "paddusw %xmm1,%xmm0\n" - "pshufb %xmm6,%xmm2\n" - "paddusw %xmm2,%xmm0\n" - "pmulhuw %xmm7,%xmm0\n" - "packuswb %xmm0,%xmm0\n" - "movd %xmm0,(%edi)\n" - "pextrw $0x2,%xmm0,%eax\n" - "mov %ax,0x4(%edi)\n" - "lea 0x6(%edi),%edi\n" - "sub $0x6,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa (%esi),%xmm2 \n" + "pavgb (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm2,%xmm0 \n" + "pshufb %xmm4,%xmm0 \n" + "movdqa %xmm2,%xmm1 \n" + "pshufb %xmm5,%xmm1 \n" + "paddusw %xmm1,%xmm0 \n" + "pshufb %xmm6,%xmm2 \n" + "paddusw %xmm2,%xmm0 \n" + "pmulhuw %xmm7,%xmm0 \n" + "packuswb %xmm0,%xmm0 \n" + "movd %xmm0,(%edi) \n" + "pextrw $0x2,%xmm0,%eax \n" + "mov %ax,0x4(%edi) \n" + "lea 0x6(%edi),%edi \n" + "sub $0x6,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); #endif // __PIC__ @@ -2008,49 +1998,49 @@ extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, uint16* dst_ptr, int src_width, int src_height); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleAddRows_SSE2\n" -"_ScaleAddRows_SSE2:\n" + ".globl _ScaleAddRows_SSE2 \n" +"_ScaleAddRows_SSE2: \n" #else - ".global ScaleAddRows_SSE2\n" -"ScaleAddRows_SSE2:\n" + ".global ScaleAddRows_SSE2 \n" +"ScaleAddRows_SSE2: \n" #endif - "pusha\n" - "mov 0x24(%esp),%esi\n" - "mov 0x28(%esp),%edx\n" - "mov 0x2c(%esp),%edi\n" - "mov 0x30(%esp),%ecx\n" - "mov 0x34(%esp),%ebx\n" - "pxor %xmm5,%xmm5\n" + "pusha \n" + "mov 0x24(%esp),%esi \n" + "mov 0x28(%esp),%edx \n" + "mov 0x2c(%esp),%edi \n" + "mov 0x30(%esp),%ecx \n" + "mov 0x34(%esp),%ebx \n" + "pxor %xmm5,%xmm5 \n" "1:" - "movdqa (%esi),%xmm2\n" - "lea (%esi,%edx,1),%eax\n" - "movhlps %xmm2,%xmm3\n" - "lea -0x1(%ebx),%ebp\n" - "punpcklbw %xmm5,%xmm2\n" - "punpcklbw 
%xmm5,%xmm3\n" + "movdqa (%esi),%xmm2 \n" + "lea (%esi,%edx,1),%eax \n" + "movhlps %xmm2,%xmm3 \n" + "lea -0x1(%ebx),%ebp \n" + "punpcklbw %xmm5,%xmm2 \n" + "punpcklbw %xmm5,%xmm3 \n" "2:" - "movdqa (%eax),%xmm0\n" - "lea (%eax,%edx,1),%eax\n" - "movhlps %xmm0,%xmm1\n" - "punpcklbw %xmm5,%xmm0\n" - "punpcklbw %xmm5,%xmm1\n" - "paddusw %xmm0,%xmm2\n" - "paddusw %xmm1,%xmm3\n" - "sub $0x1,%ebp\n" - "ja 2b\n" + "movdqa (%eax),%xmm0 \n" + "lea (%eax,%edx,1),%eax \n" + "movhlps %xmm0,%xmm1 \n" + "punpcklbw %xmm5,%xmm0 \n" + "punpcklbw %xmm5,%xmm1 \n" + "paddusw %xmm0,%xmm2 \n" + "paddusw %xmm1,%xmm3 \n" + "sub $0x1,%ebp \n" + "ja 2b \n" - "movdqa %xmm2,(%edi)\n" - "movdqa %xmm3,0x10(%edi)\n" - "lea 0x20(%edi),%edi\n" - "lea 0x10(%esi),%esi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "popa\n" - "ret\n" + "movdqa %xmm2,(%edi) \n" + "movdqa %xmm3,0x10(%edi) \n" + "lea 0x20(%edi),%edi \n" + "lea 0x10(%esi),%esi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "popa \n" + "ret \n" ); // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version @@ -2059,93 +2049,93 @@ extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleFilterRows_SSE2\n" -"_ScaleFilterRows_SSE2:\n" + ".globl _ScaleFilterRows_SSE2 \n" +"_ScaleFilterRows_SSE2: \n" #else - ".global ScaleFilterRows_SSE2\n" -"ScaleFilterRows_SSE2:\n" + ".global ScaleFilterRows_SSE2 \n" +"ScaleFilterRows_SSE2: \n" #endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%edi\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%ecx\n" - "mov 0x1c(%esp),%eax\n" - "cmp $0x0,%eax\n" - "je 2f\n" - "cmp $0x80,%eax\n" - "je 3f\n" - "movd %eax,%xmm6\n" - "punpcklwd %xmm6,%xmm6\n" - "pshufd $0x0,%xmm6,%xmm6\n" - "neg %eax\n" - "add $0x100,%eax\n" - "movd %eax,%xmm5\n" - "punpcklwd %xmm5,%xmm5\n" - "pshufd $0x0,%xmm5,%xmm5\n" - "pxor %xmm7,%xmm7\n" + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "movd %eax,%xmm6 \n" + "punpcklwd %xmm6,%xmm6 \n" + "pshufd $0x0,%xmm6,%xmm6 \n" + "neg %eax \n" + "add $0x100,%eax \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" + "pxor %xmm7,%xmm7 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,%xmm1\n" - "movdqa %xmm2,%xmm3\n" - "punpcklbw %xmm7,%xmm0\n" - "punpcklbw %xmm7,%xmm2\n" - "punpckhbw %xmm7,%xmm1\n" - "punpckhbw %xmm7,%xmm3\n" - "pmullw %xmm5,%xmm0\n" - "pmullw %xmm5,%xmm1\n" - "pmullw %xmm6,%xmm2\n" - "pmullw %xmm6,%xmm3\n" - "paddusw %xmm2,%xmm0\n" - "paddusw %xmm3,%xmm1\n" - "psrlw $0x8,%xmm0\n" - "psrlw $0x8,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "movdqa %xmm2,%xmm3 \n" + "punpcklbw %xmm7,%xmm0 \n" + "punpcklbw %xmm7,%xmm2 \n" + "punpckhbw %xmm7,%xmm1 \n" + "punpckhbw %xmm7,%xmm3 \n" + "pmullw %xmm5,%xmm0 \n" + "pmullw %xmm5,%xmm1 \n" + "pmullw %xmm6,%xmm2 \n" + "pmullw %xmm6,%xmm3 \n" + "paddusw %xmm2,%xmm0 \n" + "paddusw %xmm3,%xmm1 \n" + "psrlw $0x8,%xmm0 \n" + "psrlw $0x8,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" 
+ "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" "2:" - "movdqa (%esi),%xmm0\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 2b\n" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" "3:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "pavgb %xmm2,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 3b\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" ); // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version @@ -2154,135 +2144,135 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction); asm( - ".text\n" + ".text \n" #if defined(OSX) - ".globl _ScaleFilterRows_SSSE3\n" -"_ScaleFilterRows_SSSE3:\n" + ".globl _ScaleFilterRows_SSSE3 \n" +"_ScaleFilterRows_SSSE3: \n" #else - ".global ScaleFilterRows_SSSE3\n" -"ScaleFilterRows_SSSE3:\n" + ".global ScaleFilterRows_SSSE3 \n" +"ScaleFilterRows_SSSE3: \n" #endif - "push %esi\n" - "push %edi\n" - "mov 0xc(%esp),%edi\n" - "mov 0x10(%esp),%esi\n" - "mov 0x14(%esp),%edx\n" - "mov 0x18(%esp),%ecx\n" - "mov 0x1c(%esp),%eax\n" - "cmp $0x0,%eax\n" - "je 2f\n" - "cmp $0x80,%eax\n" - "je 3f\n" - "shr %eax\n" - "mov %al,%ah\n" - "neg %al\n" - "add $0x80,%al\n" - "movd %eax,%xmm5\n" - "punpcklwd %xmm5,%xmm5\n" - "pshufd $0x0,%xmm5,%xmm5\n" + "push %esi \n" + "push %edi \n" + "mov 0xc(%esp),%edi \n" + "mov 0x10(%esp),%esi \n" + "mov 0x14(%esp),%edx \n" + "mov 0x18(%esp),%ecx \n" + "mov 0x1c(%esp),%eax \n" + "cmp $0x0,%eax \n" + "je 2f \n" + "cmp $0x80,%eax \n" + "je 3f \n" + "shr %eax \n" + "mov %al,%ah \n" + "neg %al \n" + "add $0x80,%al \n" + "movd %eax,%xmm5 \n" + "punpcklwd %xmm5,%xmm5 \n" + "pshufd $0x0,%xmm5,%xmm5 \n" "1:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "movdqa %xmm0,%xmm1\n" - "punpcklbw %xmm2,%xmm0\n" - "punpckhbw %xmm2,%xmm1\n" - "pmaddubsw %xmm5,%xmm0\n" - "pmaddubsw %xmm5,%xmm1\n" - "psrlw $0x7,%xmm0\n" - "psrlw $0x7,%xmm1\n" - "packuswb %xmm1,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 1b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,%xmm1 \n" + "punpcklbw %xmm2,%xmm0 \n" + "punpckhbw %xmm2,%xmm1 \n" + "pmaddubsw %xmm5,%xmm0 \n" + "pmaddubsw %xmm5,%xmm1 \n" + "psrlw $0x7,%xmm0 \n" + "psrlw $0x7,%xmm1 \n" + "packuswb %xmm1,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 1b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" "2:" - "movdqa (%esi),%xmm0\n" - "lea 0x10(%esi),%esi\n" - "movdqa 
%xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 2b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "lea 0x10(%esi),%esi \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 2b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" "3:" - "movdqa (%esi),%xmm0\n" - "movdqa (%esi,%edx,1),%xmm2\n" - "lea 0x10(%esi),%esi\n" - "pavgb %xmm2,%xmm0\n" - "movdqa %xmm0,(%edi)\n" - "lea 0x10(%edi),%edi\n" - "sub $0x10,%ecx\n" - "ja 3b\n" - "mov -0x1(%edi),%al\n" - "mov %al,(%edi)\n" - "pop %edi\n" - "pop %esi\n" - "ret\n" + "movdqa (%esi),%xmm0 \n" + "movdqa (%esi,%edx,1),%xmm2 \n" + "lea 0x10(%esi),%esi \n" + "pavgb %xmm2,%xmm0 \n" + "movdqa %xmm0,(%edi) \n" + "lea 0x10(%edi),%edi \n" + "sub $0x10,%ecx \n" + "ja 3b \n" + "mov -0x1(%edi),%al \n" + "mov %al,(%edi) \n" + "pop %edi \n" + "pop %esi \n" + "ret \n" ); #elif defined(__x86_64__) static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "lea (%3,%3,2),%%r10\n" - "pxor %%xmm7,%%xmm7\n" + asm volatile ( + "lea (%3,%3,2),%%r10 \n" + "pxor %%xmm7,%%xmm7 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movdqa 0x10(%0,%3,1),%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa (%0,%3,2),%%xmm2\n" - "movdqa 0x10(%0,%3,2),%%xmm3\n" - "movdqa (%0,%%r10,1),%%xmm4\n" - "movdqa 0x10(%0,%%r10,1),%%xmm5\n" - "lea (%0,%3,4),%%r11\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "movdqa 0x0(%%r11),%%xmm2\n" - "movdqa 0x10(%%r11),%%xmm3\n" - "movdqa 0x0(%%r11,%3,1),%%xmm4\n" - "movdqa 0x10(%%r11,%3,1),%%xmm5\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "movdqa 0x0(%%r11,%3,2),%%xmm4\n" - "movdqa 0x10(%%r11,%3,2),%%xmm5\n" - "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n" - "pavgb %%xmm6,%%xmm4\n" - "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n" - "pavgb %%xmm6,%%xmm5\n" - "pavgb %%xmm4,%%xmm2\n" - "pavgb %%xmm5,%%xmm3\n" - "pavgb %%xmm2,%%xmm0\n" - "pavgb %%xmm3,%%xmm1\n" - "psadbw %%xmm7,%%xmm0\n" - "psadbw %%xmm7,%%xmm1\n" - "pshufd $0xd8,%%xmm0,%%xmm0\n" - "pshufd $0x8d,%%xmm1,%%xmm1\n" - "por %%xmm1,%%xmm0\n" - "psrlw $0x3,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "lea 0x4(%1),%1\n" - "sub $0x4,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movdqa 0x10(%0,%3,1),%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "movdqa 0x10(%0,%3,2),%%xmm3 \n" + "movdqa (%0,%%r10,1),%%xmm4 \n" + "movdqa 0x10(%0,%%r10,1),%%xmm5 \n" + "lea (%0,%3,4),%%r11 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa 0x0(%%r11),%%xmm2 \n" + "movdqa 0x10(%%r11),%%xmm3 \n" + "movdqa 0x0(%%r11,%3,1),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,1),%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "movdqa 0x0(%%r11,%3,2),%%xmm4 \n" + "movdqa 0x10(%%r11,%3,2),%%xmm5 \n" + "movdqa 0x0(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm4 \n" + "movdqa 0x10(%%r11,%%r10,1),%%xmm6 \n" + "pavgb %%xmm6,%%xmm5 \n" + "pavgb %%xmm4,%%xmm2 \n" + "pavgb %%xmm5,%%xmm3 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psadbw %%xmm7,%%xmm0 \n" + "psadbw %%xmm7,%%xmm1 \n" + "pshufd 
$0xd8,%%xmm0,%%xmm0 \n" + "pshufd $0x8d,%%xmm1,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "psrlw $0x3,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "lea 0x4(%1),%1 \n" + "sub $0x4,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2294,25 +2284,25 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN34_SSSE3 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%3),%%xmm3\n" - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" + asm volatile ( + "movdqa (%3),%%xmm3 \n" + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm2\n" - "lea 0x20(%0),%0\n" - "movdqa %%xmm2,%%xmm1\n" - "palignr $0x8,%%xmm0,%%xmm1\n" - "pshufb %%xmm3,%%xmm0\n" - "pshufb %%xmm4,%%xmm1\n" - "pshufb %%xmm5,%%xmm2\n" - "movq %%xmm0,(%1)\n" - "movq %%xmm1,0x8(%1)\n" - "movq %%xmm2,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2325,46 +2315,46 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm2\n" // _shuf01 - "movdqa (%5),%%xmm3\n" // _shuf11 - "movdqa (%6),%%xmm4\n" // _shuf21 - "movdqa (%7),%%xmm5\n" // _madd01 - "movdqa (%8),%%xmm6\n" // _madd11 - "movdqa (%9),%%xmm7\n" // _round34 - "movdqa (%10),%%xmm8\n" // _madd21 + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 "1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3),%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm2,%%xmm0\n" - "pmaddubsw %%xmm5,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqu 0x8(%0),%%xmm0\n" - "movdqu 0x8(%0,%3),%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm3,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x8(%1)\n" - "movdqa 0x10(%0),%%xmm0\n" - "movdqa 0x10(%0,%3),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm8,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 
\n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2382,49 +2372,49 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm2\n" // _shuf01 - "movdqa (%5),%%xmm3\n" // _shuf11 - "movdqa (%6),%%xmm4\n" // _shuf21 - "movdqa (%7),%%xmm5\n" // _madd01 - "movdqa (%8),%%xmm6\n" // _madd11 - "movdqa (%9),%%xmm7\n" // _round34 - "movdqa (%10),%%xmm8\n" // _madd21 + asm volatile ( + "movdqa (%4),%%xmm2 \n" // _shuf01 + "movdqa (%5),%%xmm3 \n" // _shuf11 + "movdqa (%6),%%xmm4 \n" // _shuf21 + "movdqa (%7),%%xmm5 \n" // _madd01 + "movdqa (%8),%%xmm6 \n" // _madd11 + "movdqa (%9),%%xmm7 \n" // _round34 + "movdqa (%10),%%xmm8 \n" // _madd21 "1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3,1),%%xmm1\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm2,%%xmm0\n" - "pmaddubsw %%xmm5,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movdqu 0x8(%0),%%xmm0\n" - "movdqu 0x8(%0,%3,1),%%xmm1\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm3,%%xmm0\n" - "pmaddubsw %%xmm6,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x8(%1)\n" - "movdqa 0x10(%0),%%xmm0\n" - "movdqa 0x10(%0,%3,1),%%xmm1\n" - "lea 0x20(%0),%0\n" - "pavgb %%xmm0,%%xmm1\n" - "pavgb %%xmm1,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "pmaddubsw %%xmm8,%%xmm0\n" - "paddsw %%xmm7,%%xmm0\n" - "psrlw $0x2,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movq %%xmm0,0x10(%1)\n" - "lea 0x18(%1),%1\n" - "sub $0x18,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqu 0x8(%0),%%xmm0 \n" + "movdqu 0x8(%0,%3,1),%%xmm1 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm6,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x8(%1) \n" + "movdqa 0x10(%0),%%xmm0 \n" + "movdqa 0x10(%0,%3,1),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm0,%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm8,%%xmm0 \n" + "paddsw %%xmm7,%%xmm0 \n" + "psrlw $0x2,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2443,22 +2433,22 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, #define HAS_SCALEROWDOWN38_SSSE3 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%3),%%xmm4\n" - "movdqa (%4),%%xmm5\n" + asm volatile ( + "movdqa (%3),%%xmm4 \n" + "movdqa (%4),%%xmm5 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa 0x10(%0),%%xmm1\n" - "lea 0x20(%0),%0\n" - 
"pshufb %%xmm4,%%xmm0\n" - "pshufb %%xmm5,%%xmm1\n" - "paddusb %%xmm1,%%xmm0\n" - "movq %%xmm0,(%1)\n" - "movhlps %%xmm0,%%xmm1\n" - "movd %%xmm1,0x8(%1)\n" - "lea 0xc(%1),%1\n" - "sub $0xc,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2470,50 +2460,50 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" - "movdqa (%6),%%xmm6\n" - "pxor %%xmm7,%%xmm7\n" + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + "movdqa (%6),%%xmm6 \n" + "pxor %%xmm7,%%xmm7 \n" "1:" - "movdqa (%0),%%xmm0\n" - "movdqa (%0,%3,1),%%xmm2\n" - "movhlps %%xmm0,%%xmm1\n" - "movhlps %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm0\n" - "punpcklbw %%xmm7,%%xmm1\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpcklbw %%xmm7,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "movdqa (%0,%3,2),%%xmm2\n" - "lea 0x10(%0),%0\n" - "movhlps %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpcklbw %%xmm7,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "movdqa %%xmm0,%%xmm2\n" - "psrldq $0x2,%%xmm0\n" - "paddusw %%xmm0,%%xmm2\n" - "psrldq $0x2,%%xmm0\n" - "paddusw %%xmm0,%%xmm2\n" - "pshufb %%xmm4,%%xmm2\n" - "movdqa %%xmm1,%%xmm3\n" - "psrldq $0x2,%%xmm1\n" - "paddusw %%xmm1,%%xmm3\n" - "psrldq $0x2,%%xmm1\n" - "paddusw %%xmm1,%%xmm3\n" - "pshufb %%xmm5,%%xmm3\n" - "paddusw %%xmm3,%%xmm2\n" - "pmulhuw %%xmm6,%%xmm2\n" - "packuswb %%xmm2,%%xmm2\n" - "movd %%xmm2,(%1)\n" - "pextrw $0x2,%%xmm2,%%eax\n" - "mov %%ax,0x4(%1)\n" - "lea 0x6(%1),%1\n" - "sub $0x6,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm0 \n" + "movdqa (%0,%3,1),%%xmm2 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm1 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa (%0,%3,2),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm3 \n" + "pshufb %%xmm5,%%xmm3 \n" + "paddusw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,(%1) \n" + "pextrw $0x2,%%xmm2,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2527,30 +2517,30 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, uint8* dst_ptr, int dst_width) { - asm volatile( - "movdqa (%4),%%xmm4\n" - "movdqa (%5),%%xmm5\n" - "movdqa (%6),%%xmm6\n" - "movdqa (%7),%%xmm7\n" + asm volatile ( + "movdqa (%4),%%xmm4 \n" + "movdqa (%5),%%xmm5 \n" + 
"movdqa (%6),%%xmm6 \n" + "movdqa (%7),%%xmm7 \n" "1:" - "movdqa (%0),%%xmm2\n" - "pavgb (%0,%3,1),%%xmm2\n" - "lea 0x10(%0),%0\n" - "movdqa %%xmm2,%%xmm0\n" - "pshufb %%xmm4,%%xmm0\n" - "movdqa %%xmm2,%%xmm1\n" - "pshufb %%xmm5,%%xmm1\n" - "paddusw %%xmm1,%%xmm0\n" - "pshufb %%xmm6,%%xmm2\n" - "paddusw %%xmm2,%%xmm0\n" - "pmulhuw %%xmm7,%%xmm0\n" - "packuswb %%xmm0,%%xmm0\n" - "movd %%xmm0,(%1)\n" - "pextrw $0x2,%%xmm0,%%eax\n" - "mov %%ax,0x4(%1)\n" - "lea 0x6(%1),%1\n" - "sub $0x6,%2\n" - "ja 1b\n" + "movdqa (%0),%%xmm2 \n" + "pavgb (%0,%3,1),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusw %%xmm1,%%xmm0 \n" + "pshufb %%xmm6,%%xmm2 \n" + "paddusw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%1) \n" + "pextrw $0x2,%%xmm0,%%eax \n" + "mov %%ax,0x4(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -2567,33 +2557,33 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, uint16* dst_ptr, int src_width, int src_height) { - asm volatile( - "pxor %%xmm5,%%xmm5\n" + asm volatile ( + "pxor %%xmm5,%%xmm5 \n" "1:" - "movdqa (%0),%%xmm2\n" - "lea (%0,%4,1),%%r10\n" - "movhlps %%xmm2,%%xmm3\n" - "lea -0x1(%3),%%r11\n" - "punpcklbw %%xmm5,%%xmm2\n" - "punpcklbw %%xmm5,%%xmm3\n" + "movdqa (%0),%%xmm2 \n" + "lea (%0,%4,1),%%r10 \n" + "movhlps %%xmm2,%%xmm3 \n" + "lea -0x1(%3),%%r11 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" "2:" - "movdqa (%%r10),%%xmm0\n" - "lea (%%r10,%4,1),%%r10\n" - "movhlps %%xmm0,%%xmm1\n" - "punpcklbw %%xmm5,%%xmm0\n" - "punpcklbw %%xmm5,%%xmm1\n" - "paddusw %%xmm0,%%xmm2\n" - "paddusw %%xmm1,%%xmm3\n" - "sub $0x1,%%r11\n" - "ja 2b\n" + "movdqa (%%r10),%%xmm0 \n" + "lea (%%r10,%4,1),%%r10 \n" + "movhlps %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "paddusw %%xmm0,%%xmm2 \n" + "paddusw %%xmm1,%%xmm3 \n" + "sub $0x1,%%r11 \n" + "ja 2b \n" - "movdqa %%xmm2,(%1)\n" - "movdqa %%xmm3,0x10(%1)\n" - "lea 0x20(%1),%1\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" + "movdqa %%xmm2,(%1) \n" + "movdqa %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width), // %2 @@ -2609,16 +2599,16 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { if (source_y_fraction == 0) { - asm volatile( + asm volatile ( "1:" - "movdqa (%1),%%xmm0\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 @@ -2627,18 +2617,18 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, ); return; } else if (source_y_fraction == 128) { - asm volatile( + asm volatile ( "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%3,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "pavgb %%xmm2,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "movdqa 
(%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 @@ -2647,42 +2637,42 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, ); return; } else { - asm volatile( - "mov %3,%%eax\n" - "movd %%eax,%%xmm6\n" - "punpcklwd %%xmm6,%%xmm6\n" - "pshufd $0x0,%%xmm6,%%xmm6\n" - "neg %%eax\n" - "add $0x100,%%eax\n" - "movd %%eax,%%xmm5\n" - "punpcklwd %%xmm5,%%xmm5\n" - "pshufd $0x0,%%xmm5,%%xmm5\n" - "pxor %%xmm7,%%xmm7\n" + asm volatile ( + "mov %3,%%eax \n" + "movd %%eax,%%xmm6 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "neg %%eax \n" + "add $0x100,%%eax \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pxor %%xmm7,%%xmm7 \n" "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%4,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,%%xmm1\n" - "movdqa %%xmm2,%%xmm3\n" - "punpcklbw %%xmm7,%%xmm0\n" - "punpcklbw %%xmm7,%%xmm2\n" - "punpckhbw %%xmm7,%%xmm1\n" - "punpckhbw %%xmm7,%%xmm3\n" - "pmullw %%xmm5,%%xmm0\n" - "pmullw %%xmm5,%%xmm1\n" - "pmullw %%xmm6,%%xmm2\n" - "pmullw %%xmm6,%%xmm3\n" - "paddusw %%xmm2,%%xmm0\n" - "paddusw %%xmm3,%%xmm1\n" - "psrlw $0x8,%%xmm0\n" - "psrlw $0x8,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm7,%%xmm0 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "punpckhbw %%xmm7,%%xmm3 \n" + "pmullw %%xmm5,%%xmm0 \n" + "pmullw %%xmm5,%%xmm1 \n" + "pmullw %%xmm6,%%xmm2 \n" + "pmullw %%xmm6,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 @@ -2700,16 +2690,16 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int src_stride, int dst_width, int source_y_fraction) { if (source_y_fraction == 0) { - asm volatile( + asm volatile ( "1:" - "movdqa (%1),%%xmm0\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 @@ -2718,18 +2708,18 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, ); return; } else if (source_y_fraction == 128) { - asm volatile( + asm volatile ( "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%3,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "pavgb %%xmm2,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%3,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 
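The hunks above give ScaleFilterRows three paths keyed on source_y_fraction: 0 simply copies the source row, 128 averages the two rows with pavgb, and any other value does a weighted vertical blend of the two rows. For reference, a minimal scalar sketch of that blend follows (illustrative only; FilterRows_Ref is a hypothetical name, not libyuv's C reference code, and uint8 is libyuv's typedef for unsigned char):

static void FilterRows_Ref(uint8* dst_ptr, const uint8* src_ptr,
                           int src_stride, int dst_width,
                           int source_y_fraction) {  // fraction in 0..256
  const uint8* src_ptr1 = src_ptr + src_stride;      // second input row
  int f1 = source_y_fraction;                        // weight of the lower row
  int f0 = 256 - f1;                                 // weight of the upper row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * f0 + src_ptr1[x] * f1) >> 8);
  }
  // Duplicate the last pixel one past the row, as the asm does after its
  // loop with "mov -0x1(...),%al" / "mov %al,(...)".
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];
}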
@@ -2738,33 +2728,33 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, ); return; } else { - asm volatile( - "mov %3,%%eax\n" - "shr %%eax\n" - "mov %%al,%%ah\n" - "neg %%al\n" - "add $0x80,%%al\n" - "movd %%eax,%%xmm5\n" - "punpcklwd %%xmm5,%%xmm5\n" - "pshufd $0x0,%%xmm5,%%xmm5\n" + asm volatile ( + "mov %3,%%eax \n" + "shr %%eax \n" + "mov %%al,%%ah \n" + "neg %%al \n" + "add $0x80,%%al \n" + "movd %%eax,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" "1:" - "movdqa (%1),%%xmm0\n" - "movdqa (%1,%4,1),%%xmm2\n" - "lea 0x10(%1),%1\n" - "movdqa %%xmm0,%%xmm1\n" - "punpcklbw %%xmm2,%%xmm0\n" - "punpckhbw %%xmm2,%%xmm1\n" - "pmaddubsw %%xmm5,%%xmm0\n" - "pmaddubsw %%xmm5,%%xmm1\n" - "psrlw $0x7,%%xmm0\n" - "psrlw $0x7,%%xmm1\n" - "packuswb %%xmm1,%%xmm0\n" - "movdqa %%xmm0,(%0)\n" - "lea 0x10(%0),%0\n" - "sub $0x10,%2\n" - "ja 1b\n" - "mov -0x1(%0),%%al\n" - "mov %%al,(%0)\n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "pmaddubsw %%xmm5,%%xmm0 \n" + "pmaddubsw %%xmm5,%%xmm1 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + "mov -0x1(%0),%%al \n" + "mov %%al,(%0) \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2
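The SSSE3 general case just above avoids the separate multiplies of the SSE2 path by folding both blend weights into a single byte pair for pmaddubsw: the interleaved row0/row1 bytes are multiplied by (128 - fraction/2, fraction/2) per pixel, and psrlw $0x7 divides by 128 to finish the weighted average. A small sketch of the packing arithmetic done by the shr/mov/neg/add prologue (PackBlendWeights is a hypothetical helper, shown only to spell out that sequence):

static int PackBlendWeights(int source_y_fraction) {  // fraction in 0..256
  int f1 = source_y_fraction >> 1;  // "shr %eax": weight of the second row, 0..128
  int f0 = 128 - f1;                // "neg %al; add $0x80,%al": weight of the first row
  // "mov %al,%ah" places f1 in the high byte; punpcklwd/pshufd then
  // broadcast this 16-bit pair to every lane before the loop.
  return (f1 << 8) | f0;
}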