From 1f461f73d861d260fb3f7195fbbd0a85f0292612 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 4 Aug 2015 17:00:03 -0700 Subject: [PATCH] remove align directives R=harryjin@google.com BUG=none Review URL: https://webrtc-codereview.appspot.com/54809004. --- source/compare_neon.cc | 1 - source/compare_neon64.cc | 1 - source/rotate_gcc.cc | 6 +-- source/rotate_neon.cc | 2 - source/row_gcc.cc | 95 ---------------------------------------- source/row_mips.cc | 7 --- source/row_neon.cc | 80 --------------------------------- source/scale_mips.cc | 10 ----- source/scale_neon.cc | 20 --------- 9 files changed, 3 insertions(+), 219 deletions(-) diff --git a/source/compare_neon.cc b/source/compare_neon.cc index ef006ec41..5b27407ef 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -27,7 +27,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "vmov.u8 q9, #0 \n" "vmov.u8 q11, #0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 6d1e5e1bc..b3d55c422 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -26,7 +26,6 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "eor v17.16b, v17.16b, v17.16b \n" "eor v19.16b, v19.16b, v19.16b \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 7ed830362..72b17f3ef 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 2 \n" + LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" @@ -114,7 +114,7 @@ void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 2 \n" + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%3),%%xmm1 \n" @@ -256,7 +256,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, asm volatile ( // Read in the data from the source pointer. // First round of bit swap. - ".p2align 2 \n" + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%4),%%xmm1 \n" diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 76043b3b3..9e4ecd80d 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -35,7 +35,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "sub %5, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" @@ -256,7 +255,6 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "sub %7, #8 \n" // handle 8x8 blocks. this should be the majority of the plane - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 820de0a1c..72949e076 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -141,101 +141,6 @@ static uvec8 kShuffleMaskARGBToRAW_0 = { }; #endif // HAS_RGB24TOARGBROW_SSSE3 -#if defined(TESTING) && defined(__x86_64__) -void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { - asm volatile ( - ".p2align 5 \n" - "mov %%eax,%%eax \n" - "mov %%ebx,%%ebx \n" - "mov %%ecx,%%ecx \n" - "mov %%edx,%%edx \n" - "mov %%esi,%%esi \n" - "mov %%edi,%%edi \n" - "mov %%ebp,%%ebp \n" - "mov %%esp,%%esp \n" - ".p2align 5 \n" - "mov %%r8d,%%r8d \n" - "mov %%r9d,%%r9d \n" - "mov %%r10d,%%r10d \n" - "mov %%r11d,%%r11d \n" - "mov %%r12d,%%r12d \n" - "mov %%r13d,%%r13d \n" - "mov %%r14d,%%r14d \n" - "mov %%r15d,%%r15d \n" - ".p2align 5 \n" - "lea (%%rax),%%eax \n" - "lea (%%rbx),%%ebx \n" - "lea (%%rcx),%%ecx \n" - "lea (%%rdx),%%edx \n" - "lea (%%rsi),%%esi \n" - "lea (%%rdi),%%edi \n" - "lea (%%rbp),%%ebp \n" - "lea (%%rsp),%%esp \n" - ".p2align 5 \n" - "lea (%%r8),%%r8d \n" - "lea (%%r9),%%r9d \n" - "lea (%%r10),%%r10d \n" - "lea (%%r11),%%r11d \n" - "lea (%%r12),%%r12d \n" - "lea (%%r13),%%r13d \n" - "lea (%%r14),%%r14d \n" - "lea (%%r15),%%r15d \n" - - ".p2align 5 \n" - "lea 0x10(%%rax),%%eax \n" - "lea 0x10(%%rbx),%%ebx \n" - "lea 0x10(%%rcx),%%ecx \n" - "lea 0x10(%%rdx),%%edx \n" - "lea 0x10(%%rsi),%%esi \n" - "lea 0x10(%%rdi),%%edi \n" - "lea 0x10(%%rbp),%%ebp \n" - "lea 0x10(%%rsp),%%esp \n" - ".p2align 5 \n" - "lea 0x10(%%r8),%%r8d \n" - "lea 0x10(%%r9),%%r9d \n" - "lea 0x10(%%r10),%%r10d \n" - "lea 0x10(%%r11),%%r11d \n" - "lea 0x10(%%r12),%%r12d \n" - "lea 0x10(%%r13),%%r13d \n" - "lea 0x10(%%r14),%%r14d \n" - "lea 0x10(%%r15),%%r15d \n" - - ".p2align 5 \n" - "add 0x10,%%eax \n" - "add 0x10,%%ebx \n" - "add 0x10,%%ecx \n" - "add 0x10,%%edx \n" - "add 0x10,%%esi \n" - "add 0x10,%%edi \n" - "add 0x10,%%ebp \n" - "add 0x10,%%esp \n" - ".p2align 5 \n" - "add 0x10,%%r8d \n" - "add 0x10,%%r9d \n" - "add 0x10,%%r10d \n" - "add 0x10,%%r11d \n" - "add 0x10,%%r12d \n" - "add 0x10,%%r13d \n" - "add 0x10,%%r14d \n" - "add 0x10,%%r15d \n" - - ".p2align 2 \n" - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(pix) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm5" - ); -} -#endif // TESTING - #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( diff --git a/source/row_mips.cc b/source/row_mips.cc index cfc9ffe03..1183c7183 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -389,7 +389,6 @@ void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez $t4, 2f \n" " andi %[width], %[width], 0xf \n" // residual - ".p2align 2 \n" "1: \n" "addiu $t4, $t4, -1 \n" "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 @@ -457,7 +456,6 @@ void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { "blez $t4, 2f \n" " addu %[src], %[src], %[width] \n" // src += width - ".p2align 2 \n" "1: \n" "lw $t0, -16(%[src]) \n" // |3|2|1|0| "lw $t1, -12(%[src]) \n" // |7|6|5|4| @@ -512,7 +510,6 @@ void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "blez %[x], 2f \n" " addu %[src_uv], %[src_uv], $t4 \n" - ".p2align 2 \n" "1: \n" "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| @@ -673,7 +670,6 @@ void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, "lui $s6, 0xff00 \n" "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - ".p2align 2 \n" "1: \n" I422ToTransientMipsRGB // Arranging into argb format @@ -735,7 +731,6 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, "lui $s6, 0xff00 \n" "ori $s6, 0xff00 \n" // |ff|00|ff|00| - ".p2align 2 \n" "1: \n" I422ToTransientMipsRGB // Arranging into abgr format @@ -797,7 +792,6 @@ void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, "lui $s6, 0xff \n" "ori $s6, 0xff \n" // |00|ff|00|ff| - ".p2align 2 \n" "1: \n" I422ToTransientMipsRGB // Arranging into bgra format @@ -857,7 +851,6 @@ void InterpolateRow_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr, "replv.ph $t0, %[y0_fraction] \n" "replv.ph $t1, %[source_y_fraction] \n" - ".p2align 2 \n" "1: \n" "lw $t2, 0(%[src_ptr]) \n" "lw $t3, 0(%[src_ptr1]) \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index 1a72eb903..4298a3622 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -174,7 +174,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV444 YUV422TORGB @@ -204,7 +203,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -234,7 +232,6 @@ void I411ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV411 YUV422TORGB @@ -264,7 +261,6 @@ void I422ToBGRARow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -295,7 +291,6 @@ void I422ToABGRRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -326,7 +321,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -356,7 +350,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -385,7 +378,6 @@ void I422ToRAWRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -427,7 +419,6 @@ void I422ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -472,7 +463,6 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -513,7 +503,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, asm volatile ( YUV422TORGB_SETUP_REG "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" "1: \n" READYUV422 YUV422TORGB @@ -542,7 +531,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUV400 YUV422TORGB @@ -568,7 +556,6 @@ void J400ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( "vmov.u8 d23, #255 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" @@ -592,7 +579,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -620,7 +606,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -648,7 +633,6 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READNV12 YUV422TORGB @@ -676,7 +660,6 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READNV21 YUV422TORGB @@ -703,7 +686,6 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READYUY2 YUV422TORGB @@ -729,7 +711,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, int width) { asm volatile ( YUV422TORGB_SETUP_REG - ".p2align 2 \n" "1: \n" READUYVY YUV422TORGB @@ -754,7 +735,6 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV @@ -777,7 +757,6 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U @@ -800,7 +779,6 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 @@ -855,7 +833,6 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 @@ -882,7 +859,6 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 @@ -909,7 +885,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #16 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 @@ -931,7 +906,6 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -950,7 +924,6 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d4, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -982,7 +955,6 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -1030,7 +1002,6 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1061,7 +1032,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int pix) { asm volatile ( "vmov.u8 d3, #255 \n" // Alpha - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1080,7 +1050,6 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. @@ -1098,7 +1067,6 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. @@ -1117,7 +1085,6 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1135,7 +1102,6 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. @@ -1154,7 +1120,6 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1176,7 +1141,6 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1199,7 +1163,6 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_yuy2 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1227,7 +1190,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "add %1, %0, %1 \n" // stride + src_uyvy - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1279,7 +1241,6 @@ void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_yuy2, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys @@ -1306,7 +1267,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_v, uint8* dst_uyvy, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys @@ -1330,7 +1290,6 @@ void I422ToUYVYRow_NEON(const uint8* src_y, void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1350,7 +1309,6 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width) { asm volatile ( - ".p2align 2 \n" "vdup.32 d2, %2 \n" // dither4 "1: \n" MEMACCESS(1) @@ -1374,7 +1332,6 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1395,7 +1352,6 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, int pix) { asm volatile ( "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. @@ -1418,7 +1374,6 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1444,7 +1399,6 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1474,7 +1428,6 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1516,7 +1469,6 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1566,7 +1518,6 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1644,7 +1595,6 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1694,7 +1644,6 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -1743,7 +1692,6 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. @@ -1792,7 +1740,6 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. @@ -1841,7 +1788,6 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. @@ -1890,7 +1836,6 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. @@ -1939,7 +1884,6 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. @@ -1989,7 +1933,6 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -2059,7 +2002,6 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -2129,7 +2071,6 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -2194,7 +2135,6 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -2222,7 +2162,6 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -2250,7 +2189,6 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -2278,7 +2216,6 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. @@ -2305,7 +2242,6 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. @@ -2332,7 +2268,6 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. @@ -2359,7 +2294,6 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. @@ -2386,7 +2320,6 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. @@ -2605,7 +2538,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. @@ -2648,7 +2580,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. @@ -2684,7 +2615,6 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2721,7 +2651,6 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. @@ -2760,7 +2689,6 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. @@ -2820,7 +2748,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -2854,7 +2781,6 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2881,7 +2807,6 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2913,7 +2838,6 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. @@ -2940,7 +2864,6 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { asm volatile ( // 16 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -2970,7 +2893,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, asm volatile ( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. @@ -2997,7 +2919,6 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top @@ -3041,7 +2962,6 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left diff --git a/source/scale_mips.cc b/source/scale_mips.cc index 3eb4f27c4..2298a74b9 100644 --- a/source/scale_mips.cc +++ b/source/scale_mips.cc @@ -31,7 +31,6 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "beqz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -90,7 +89,6 @@ void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "bltz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -188,7 +186,6 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "beqz $t9, 2f \n" " nop \n" - ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -248,7 +245,6 @@ void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, "srl $t9, %[dst_width], 1 \n" "andi $t8, %[dst_width], 1 \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 0(%[s1]) \n" // |7|6|5|4| @@ -319,7 +315,6 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -368,7 +363,6 @@ void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set noreorder \n" "repl.ph $t3, 3 \n" // 0x00030003 - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -425,7 +419,6 @@ void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set noreorder \n" "repl.ph $t2, 3 \n" // 0x00030003 - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| @@ -477,7 +470,6 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| @@ -528,7 +520,6 @@ void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| @@ -586,7 +577,6 @@ void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, ".set push \n" ".set noreorder \n" - ".p2align 2 \n" "1: \n" "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 7825878e9..10856cf84 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -26,7 +26,6 @@ extern "C" { void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 MEMACCESS(0) @@ -47,7 +46,6 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc @@ -73,7 +71,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc @@ -101,7 +98,6 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -123,7 +119,6 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, const uint8* src_ptr2 = src_ptr + src_stride * 2; const uint8* src_ptr3 = src_ptr + src_stride * 3; asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load up 16x4 @@ -162,7 +157,6 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -185,7 +179,6 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -245,7 +238,6 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, asm volatile ( "vmov.u8 d24, #3 \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 @@ -300,7 +292,6 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, asm volatile ( MEMACCESS(3) "vld1.8 {q3}, [%3] \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" @@ -334,7 +325,6 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, MEMACCESS(7) "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -450,7 +440,6 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, MEMACCESS(5) "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" - ".p2align 2 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 @@ -545,7 +534,6 @@ void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { const uint8* src_tmp = NULL; asm volatile ( - ".p2align 2 \n" "1: \n" "mov %0, %1 \n" "mov r12, %5 \n" @@ -590,7 +578,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, int* tmp = dx_offset; const uint8* src_tmp = src_ptr; asm volatile ( - ".p2align 2 \n" "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3 @@ -749,7 +736,6 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" // load even pixels into q0, odd into q1 MEMACCESS(0) @@ -773,7 +759,6 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { asm volatile ( - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -804,7 +789,6 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( // change the stride to row 2 pointer "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -845,7 +829,6 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( "mov r12, %3, lsl #2 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.32 {d0[0]}, [%0], r12 \n" @@ -875,7 +858,6 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, asm volatile ( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" - ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 @@ -930,7 +912,6 @@ void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, int tmp = 0; const uint8* src_tmp = src_argb; asm volatile ( - ".p2align 2 \n" "1: \n" LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 1) @@ -974,7 +955,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, int* tmp = dx_offset; const uint8* src_tmp = src_argb; asm volatile ( - ".p2align 2 \n" "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3