diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h index 57a6a12e9..26845b602 100644 --- a/include/libyuv/cpu_id.h +++ b/include/libyuv/cpu_id.h @@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20; static const int kCpuHasLSX = 0x100; static const int kCpuHasLASX = 0x200; - // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2b0845c16..fee2d2481 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -3613,9 +3613,9 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, int scale, int width); void Convert16To8Row_AVX512BW(const uint16_t* src_y, - uint8_t* dst_y, - int scale, - int width); + uint8_t* dst_y, + int scale, + int width); void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index a2eb82db8..b52a38a99 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TORGB565_SVE_FROM_TOP_2X "st2h {z18.h, z19.h}, p1, [%[dst]] \n" "99: \n" @@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TOARGB1555_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" @@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X - RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X + RGB8TOARGB4444_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" diff --git a/source/compare_gcc.cc b/source/compare_gcc.cc index 492969259..3838abd72 100644 --- a/source/compare_gcc.cc +++ b/source/compare_gcc.cc @@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint64_t diff; - asm volatile ( + asm volatile( "xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" @@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, int count) { uint32_t diff = 0u; - asm volatile ( + asm volatile( // Process 16 bytes per loop. 
LABELALIGN "1: \n" @@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" @@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" @@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" @@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = { uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; - asm volatile ( + asm volatile( "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" diff --git a/source/compare_neon.cc b/source/compare_neon.cc index c2aea6074..afdd60121 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, int count) { uint32_t diff; - asm volatile ( + asm volatile( "vmov.u16 q4, #0 \n" // accumulator "1: \n" @@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "vmov.u8 q8, #0 \n" "vmov.u8 q10, #0 \n" "vmov.u8 q9, #0 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 07292deff..49246aaeb 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movi v4.8h, #0 \n" "1: \n" @@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; - asm volatile ( + asm volatile( "movi v16.16b, #0 \n" "movi v17.16b, #0 \n" "movi v18.16b, #0 \n" @@ -116,30 +116,30 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; const uint32_t c16 = 0x92d9e201; // 33^16 uint32_t tmp, tmp2; - asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" - "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" + asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" + "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" // count is always a multiple of 16. // maintain two accumulators, reduce and then final sum in scalar since // this has better performance on little cores. 
- "1: \n" - "ldr q0, [%[src]], #16 \n" - "subs %w[count], %w[count], #16 \n" - "tbl v3.16b, {v0.16b}, v19.16b \n" - "tbl v2.16b, {v0.16b}, v18.16b \n" - "tbl v1.16b, {v0.16b}, v17.16b \n" - "tbl v0.16b, {v0.16b}, v16.16b \n" - "mul v3.4s, v3.4s, v7.4s \n" - "mul v2.4s, v2.4s, v6.4s \n" - "mla v3.4s, v1.4s, v5.4s \n" - "mla v2.4s, v0.4s, v4.4s \n" - "addv s1, v3.4s \n" - "addv s0, v2.4s \n" - "fmov %w[tmp2], s1 \n" - "fmov %w[tmp], s0 \n" - "add %w[tmp], %w[tmp], %w[tmp2] \n" - "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n" - "b.gt 1b \n" + "1: \n" + "ldr q0, [%[src]], #16 \n" + "subs %w[count], %w[count], #16 \n" + "tbl v3.16b, {v0.16b}, v19.16b \n" + "tbl v2.16b, {v0.16b}, v18.16b \n" + "tbl v1.16b, {v0.16b}, v17.16b \n" + "tbl v0.16b, {v0.16b}, v16.16b \n" + "mul v3.4s, v3.4s, v7.4s \n" + "mul v2.4s, v2.4s, v6.4s \n" + "mla v3.4s, v1.4s, v5.4s \n" + "mla v2.4s, v0.4s, v4.4s \n" + "addv s1, v3.4s \n" + "addv s0, v2.4s \n" + "fmov %w[tmp2], s1 \n" + "fmov %w[tmp], s0 \n" + "add %w[tmp], %w[tmp], %w[tmp2] \n" + "madd %w[hash], %w[hash], %w[c16], %w[tmp] \n" + "b.gt 1b \n" : [hash] "+r"(hash), // %[hash] [count] "+r"(count), // %[count] [tmp] "=&r"(tmp), // %[tmp] @@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; - asm volatile ( + asm volatile( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" "movi v6.16b, #1 \n" @@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a, int count) { // count is guaranteed to be a multiple of 32. uint32_t sse; - asm volatile ( + asm volatile( "movi v4.4s, #0 \n" "movi v5.4s, #0 \n" diff --git a/source/convert.cc b/source/convert.cc index bf886bc1c..0bdb8998f 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -665,7 +665,7 @@ int I010ToNV12(const uint16_t* src_y, void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) = Convert16To8Row_C; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, - uint8_t* dst_uv, int width) = MergeUVRow_C; + uint8_t* dst_uv, int width) = MergeUVRow_C; if ((!src_y && dst_y) || !src_u || !src_v || !dst_uv || width <= 0 || height == 0) { return -1; diff --git a/source/convert_to_argb.cc b/source/convert_to_argb.cc index d0ff18a5e..72d21b042 100644 --- a/source/convert_to_argb.cc +++ b/source/convert_to_argb.cc @@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample, uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (dst_argb == NULL || sample == NULL || - src_width <= 0 || src_width > INT_MAX / 4 || - crop_width <= 0 || crop_width > INT_MAX / 4 || + if (dst_argb == NULL || sample == NULL || src_width <= 0 || + src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 || src_height == 0 || crop_height == 0) { return -1; } @@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample, } if (need_buf) { - const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height; + const uint64_t rotate_buffer_size = + (uint64_t)crop_width * 4 * abs_crop_height; if (rotate_buffer_size > SIZE_MAX) { return -1; // Invalid size. } diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc index a2bc189be..aab071e1a 100644 --- a/source/convert_to_i420.cc +++ b/source/convert_to_i420.cc @@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample, const int inv_crop_height = (src_height < 0) ? 
-abs_crop_height : abs_crop_height; - if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 || - crop_width <= 0 || src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 || + crop_height == 0) { return -1; } @@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample, if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2; + const uint64_t rotate_buffer_size = + (uint64_t)y_size + (uint64_t)uv_size * 2; if (rotate_buffer_size > SIZE_MAX) { return -1; // Invalid size. } diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index e253797e8..7fda09d43 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb, #endif #if defined(HAS_COPYROW_AVX512BW) if (TestCpuFlag(kCpuHasAVX512BW)) { - CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; + CopyRow = + IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW; } #endif #if defined(HAS_COPYROW_ERMS) diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc index 48926b687..fd5eee05f 100644 --- a/source/rotate_gcc.cc +++ b/source/rotate_gcc.cc @@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src, uint8_t* dst_b, int dst_stride_b, int width) { - asm volatile ( + asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN @@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "movdqu (%0),%%xmm0 \n" // a b c d @@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src, uint8_t* dst, int dst_stride, int width) { - asm volatile ( + asm volatile( // Main loop transpose 2 blocks of 4x4. Read a column, write a row. "1: \n" "vmovdqu (%0),%%xmm0 \n" // a b c d diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc index 334a9f998..a16ef7266 100644 --- a/source/rotate_neon.cc +++ b/source/rotate_neon.cc @@ -27,57 +27,57 @@ void TransposeWx8_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %[width], #8 \n" + "sub %[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "vld1.8 {d0}, [%[temp]], %[src_stride] \n" - "vld1.8 {d1}, [%[temp]], %[src_stride] \n" - "vld1.8 {d2}, [%[temp]], %[src_stride] \n" - "vld1.8 {d3}, [%[temp]], %[src_stride] \n" - "vld1.8 {d4}, [%[temp]], %[src_stride] \n" - "vld1.8 {d5}, [%[temp]], %[src_stride] \n" - "vld1.8 {d6}, [%[temp]], %[src_stride] \n" - "vld1.8 {d7}, [%[temp]] \n" - "add %[src], #8 \n" + "1: \n" + "mov %[temp], %[src] \n" + "vld1.8 {d0}, [%[temp]], %[src_stride] \n" + "vld1.8 {d1}, [%[temp]], %[src_stride] \n" + "vld1.8 {d2}, [%[temp]], %[src_stride] \n" + "vld1.8 {d3}, [%[temp]], %[src_stride] \n" + "vld1.8 {d4}, [%[temp]], %[src_stride] \n" + "vld1.8 {d5}, [%[temp]], %[src_stride] \n" + "vld1.8 {d6}, [%[temp]], %[src_stride] \n" + "vld1.8 {d7}, [%[temp]] \n" + "add %[src], #8 \n" - "vtrn.8 d1, d0 \n" - "vtrn.8 d3, d2 \n" - "vtrn.8 d5, d4 \n" - "vtrn.8 d7, d6 \n" - "subs %[width], #8 \n" + "vtrn.8 d1, d0 \n" + "vtrn.8 d3, d2 \n" + "vtrn.8 d5, d4 \n" + "vtrn.8 d7, d6 \n" + "subs %[width], #8 \n" - "vtrn.16 d1, d3 \n" - "vtrn.16 d0, d2 \n" - "vtrn.16 d5, d7 \n" - "vtrn.16 d4, d6 \n" + "vtrn.16 d1, d3 \n" + "vtrn.16 d0, d2 \n" + "vtrn.16 d5, d7 \n" + "vtrn.16 d4, d6 \n" - "vtrn.32 d1, d5 \n" - "vtrn.32 d0, d4 \n" - "vtrn.32 d3, d7 \n" - "vtrn.32 d2, d6 \n" + "vtrn.32 d1, d5 \n" + "vtrn.32 d0, d4 \n" + "vtrn.32 d3, d7 \n" + "vtrn.32 d2, d6 \n" - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" - "mov %[temp], %[dst] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride] \n" - "vst1.8 {d6}, [%[temp]] \n" - "add %[dst], %[dst], %[dst_stride], lsl #3 \n" + "mov %[temp], %[dst] \n" + "vst1.8 {d1}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d0}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d3}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d2}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d5}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d4}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d7}, [%[temp]], %[dst_stride] \n" + "vst1.8 {d6}, [%[temp]] \n" + "add %[dst], %[dst], %[dst_stride], lsl #3 \n" - "bge 1b \n" + "bge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst] "+r"(dst), // %[dst] @@ -95,72 +95,72 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %[width], #8 \n" + "sub %[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" - "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" - "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n" - "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n" - "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n" - "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n" - "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n" - "vld2.8 {d22, d23}, [%[temp]] \n" - "add %[src], #8*2 \n" + "1: \n" + "mov %[temp], %[src] \n" + "vld2.8 {d0, d1}, [%[temp]], %[src_stride] \n" + "vld2.8 {d2, d3}, [%[temp]], %[src_stride] \n" + "vld2.8 {d4, d5}, [%[temp]], %[src_stride] \n" + "vld2.8 {d6, d7}, [%[temp]], %[src_stride] \n" + "vld2.8 {d16, d17}, [%[temp]], %[src_stride] \n" + "vld2.8 {d18, d19}, [%[temp]], %[src_stride] \n" + "vld2.8 {d20, d21}, [%[temp]], %[src_stride] \n" + "vld2.8 {d22, d23}, [%[temp]] \n" + "add %[src], #8*2 \n" - "vtrn.8 q1, q0 \n" - "vtrn.8 q3, q2 \n" - "vtrn.8 q9, q8 \n" - "vtrn.8 q11, q10 \n" - "subs %[width], #8 \n" + "vtrn.8 q1, q0 \n" + "vtrn.8 q3, q2 \n" + "vtrn.8 q9, q8 \n" + "vtrn.8 q11, q10 \n" + "subs %[width], #8 \n" - "vtrn.16 q1, q3 \n" - "vtrn.16 q0, q2 \n" - "vtrn.16 q9, q11 \n" - "vtrn.16 q8, q10 \n" + "vtrn.16 q1, q3 \n" + "vtrn.16 q0, q2 \n" + "vtrn.16 q9, q11 \n" + "vtrn.16 q8, q10 \n" - "vtrn.32 q1, q9 \n" - "vtrn.32 q0, q8 \n" - "vtrn.32 q3, q11 \n" - "vtrn.32 q2, q10 \n" + "vtrn.32 q1, q9 \n" + "vtrn.32 q0, q8 \n" + "vtrn.32 q3, q11 \n" + "vtrn.32 q2, q10 \n" - "vrev16.8 q0, q0 \n" - "vrev16.8 q1, q1 \n" - "vrev16.8 q2, q2 \n" - "vrev16.8 q3, q3 \n" - "vrev16.8 q8, q8 \n" - "vrev16.8 q9, q9 \n" - "vrev16.8 q10, q10 \n" - "vrev16.8 q11, q11 \n" + "vrev16.8 q0, q0 \n" + "vrev16.8 q1, q1 \n" + "vrev16.8 q2, q2 \n" + "vrev16.8 q3, q3 \n" + "vrev16.8 q8, q8 \n" + "vrev16.8 q9, q9 \n" + "vrev16.8 q10, q10 \n" + "vrev16.8 q11, q11 \n" - "mov %[temp], %[dst_a] \n" - "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n" - "vst1.8 {d20}, [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" + "mov %[temp], %[dst_a] \n" + "vst1.8 {d2}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d0}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d6}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d4}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d18}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d16}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d22}, [%[temp]], %[dst_stride_a] \n" + "vst1.8 {d20}, [%[temp]] \n" + "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - "mov %[temp], %[dst_b] \n" - "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n" - "vst1.8 {d21}, [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" + "mov %[temp], %[dst_b] \n" + "vst1.8 {d3}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d1}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d7}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d5}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d19}, [%[temp]], %[dst_stride_b] \n" + 
"vst1.8 {d17}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d23}, [%[temp]], %[dst_stride_b] \n" + "vst1.8 {d21}, [%[temp]] \n" + "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - "bge 1b \n" + "bge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst_a] "+r"(dst_a), // %[dst_a] @@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n" diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index dbf08edac..4a5e181a6 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -27,104 +27,104 @@ void TransposeWx16_NEON(const uint8_t* src, int dst_stride, int width) { const uint8_t* src_temp; - asm volatile ( - "1: \n" - "mov %[src_temp], %[src] \n" + asm volatile( + "1: \n" + "mov %[src_temp], %[src] \n" - "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n" - "add %[src], %[src], #16 \n" + "add %[src], %[src], #16 \n" // Transpose bytes within each 2x2 block. 
- "trn1 v0.16b, v16.16b, v17.16b \n" - "trn2 v1.16b, v16.16b, v17.16b \n" - "trn1 v2.16b, v18.16b, v19.16b \n" - "trn2 v3.16b, v18.16b, v19.16b \n" - "trn1 v4.16b, v20.16b, v21.16b \n" - "trn2 v5.16b, v20.16b, v21.16b \n" - "trn1 v6.16b, v22.16b, v23.16b \n" - "trn2 v7.16b, v22.16b, v23.16b \n" - "trn1 v8.16b, v24.16b, v25.16b \n" - "trn2 v9.16b, v24.16b, v25.16b \n" - "trn1 v10.16b, v26.16b, v27.16b \n" - "trn2 v11.16b, v26.16b, v27.16b \n" - "trn1 v12.16b, v28.16b, v29.16b \n" - "trn2 v13.16b, v28.16b, v29.16b \n" - "trn1 v14.16b, v30.16b, v31.16b \n" - "trn2 v15.16b, v30.16b, v31.16b \n" + "trn1 v0.16b, v16.16b, v17.16b \n" + "trn2 v1.16b, v16.16b, v17.16b \n" + "trn1 v2.16b, v18.16b, v19.16b \n" + "trn2 v3.16b, v18.16b, v19.16b \n" + "trn1 v4.16b, v20.16b, v21.16b \n" + "trn2 v5.16b, v20.16b, v21.16b \n" + "trn1 v6.16b, v22.16b, v23.16b \n" + "trn2 v7.16b, v22.16b, v23.16b \n" + "trn1 v8.16b, v24.16b, v25.16b \n" + "trn2 v9.16b, v24.16b, v25.16b \n" + "trn1 v10.16b, v26.16b, v27.16b \n" + "trn2 v11.16b, v26.16b, v27.16b \n" + "trn1 v12.16b, v28.16b, v29.16b \n" + "trn2 v13.16b, v28.16b, v29.16b \n" + "trn1 v14.16b, v30.16b, v31.16b \n" + "trn2 v15.16b, v30.16b, v31.16b \n" // Transpose 2x2-byte blocks within each 4x4 block. - "trn1 v16.8h, v0.8h, v2.8h \n" - "trn1 v17.8h, v1.8h, v3.8h \n" - "trn2 v18.8h, v0.8h, v2.8h \n" - "trn2 v19.8h, v1.8h, v3.8h \n" - "trn1 v20.8h, v4.8h, v6.8h \n" - "trn1 v21.8h, v5.8h, v7.8h \n" - "trn2 v22.8h, v4.8h, v6.8h \n" - "trn2 v23.8h, v5.8h, v7.8h \n" - "trn1 v24.8h, v8.8h, v10.8h \n" - "trn1 v25.8h, v9.8h, v11.8h \n" - "trn2 v26.8h, v8.8h, v10.8h \n" - "trn2 v27.8h, v9.8h, v11.8h \n" - "trn1 v28.8h, v12.8h, v14.8h \n" - "trn1 v29.8h, v13.8h, v15.8h \n" - "trn2 v30.8h, v12.8h, v14.8h \n" - "trn2 v31.8h, v13.8h, v15.8h \n" + "trn1 v16.8h, v0.8h, v2.8h \n" + "trn1 v17.8h, v1.8h, v3.8h \n" + "trn2 v18.8h, v0.8h, v2.8h \n" + "trn2 v19.8h, v1.8h, v3.8h \n" + "trn1 v20.8h, v4.8h, v6.8h \n" + "trn1 v21.8h, v5.8h, v7.8h \n" + "trn2 v22.8h, v4.8h, v6.8h \n" + "trn2 v23.8h, v5.8h, v7.8h \n" + "trn1 v24.8h, v8.8h, v10.8h \n" + "trn1 v25.8h, v9.8h, v11.8h \n" + "trn2 v26.8h, v8.8h, v10.8h \n" + "trn2 v27.8h, v9.8h, v11.8h \n" + "trn1 v28.8h, v12.8h, v14.8h \n" + "trn1 v29.8h, v13.8h, v15.8h \n" + "trn2 v30.8h, v12.8h, v14.8h \n" + "trn2 v31.8h, v13.8h, v15.8h \n" - "subs %w[width], %w[width], #16 \n" + "subs %w[width], %w[width], #16 \n" // Transpose 4x4-byte blocks within each 8x8 block. 
- "trn1 v0.4s, v16.4s, v20.4s \n" - "trn1 v2.4s, v17.4s, v21.4s \n" - "trn1 v4.4s, v18.4s, v22.4s \n" - "trn1 v6.4s, v19.4s, v23.4s \n" - "trn2 v8.4s, v16.4s, v20.4s \n" - "trn2 v10.4s, v17.4s, v21.4s \n" - "trn2 v12.4s, v18.4s, v22.4s \n" - "trn2 v14.4s, v19.4s, v23.4s \n" - "trn1 v1.4s, v24.4s, v28.4s \n" - "trn1 v3.4s, v25.4s, v29.4s \n" - "trn1 v5.4s, v26.4s, v30.4s \n" - "trn1 v7.4s, v27.4s, v31.4s \n" - "trn2 v9.4s, v24.4s, v28.4s \n" - "trn2 v11.4s, v25.4s, v29.4s \n" - "trn2 v13.4s, v26.4s, v30.4s \n" - "trn2 v15.4s, v27.4s, v31.4s \n" + "trn1 v0.4s, v16.4s, v20.4s \n" + "trn1 v2.4s, v17.4s, v21.4s \n" + "trn1 v4.4s, v18.4s, v22.4s \n" + "trn1 v6.4s, v19.4s, v23.4s \n" + "trn2 v8.4s, v16.4s, v20.4s \n" + "trn2 v10.4s, v17.4s, v21.4s \n" + "trn2 v12.4s, v18.4s, v22.4s \n" + "trn2 v14.4s, v19.4s, v23.4s \n" + "trn1 v1.4s, v24.4s, v28.4s \n" + "trn1 v3.4s, v25.4s, v29.4s \n" + "trn1 v5.4s, v26.4s, v30.4s \n" + "trn1 v7.4s, v27.4s, v31.4s \n" + "trn2 v9.4s, v24.4s, v28.4s \n" + "trn2 v11.4s, v25.4s, v29.4s \n" + "trn2 v13.4s, v26.4s, v30.4s \n" + "trn2 v15.4s, v27.4s, v31.4s \n" // Transpose 8x8-byte blocks and store. - "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n" - "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n" - "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n" + "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n" + "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src] "+r"(src), // %[src] [src_temp] "=&r"(src_temp), // %[src_temp] [dst] "+r"(dst), // %[dst] @@ -145,76 +145,76 @@ void TransposeUVWx8_NEON(const uint8_t* src, int dst_stride_b, int width) { const uint8_t* temp; - asm volatile ( + asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. 
starting the counter // at w-8 allow for this - "sub %w[width], %w[width], #8 \n" + "sub %w[width], %w[width], #8 \n" - "1: \n" - "mov %[temp], %[src] \n" - "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v2.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v3.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v4.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v5.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v6.16b}, [%[temp]], %[src_stride] \n" - "ld1 {v7.16b}, [%[temp]] \n" - "add %[src], %[src], #16 \n" + "1: \n" + "mov %[temp], %[src] \n" + "ld1 {v0.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v1.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v2.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v3.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v4.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v5.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v6.16b}, [%[temp]], %[src_stride] \n" + "ld1 {v7.16b}, [%[temp]] \n" + "add %[src], %[src], #16 \n" - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" - "subs %w[width], %w[width], #8 \n" + "subs %w[width], %w[width], #8 \n" - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" - "mov %[temp], %[dst_a] \n" - "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n" - "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n" - "st1 {v19.d}[1], [%[temp]] \n" - "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" + "mov %[temp], %[dst_a] \n" + "st1 {v16.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v18.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v17.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v19.d}[0], [%[temp]], %[dst_stride_a] \n" + "st1 {v16.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v18.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v17.d}[1], [%[temp]], %[dst_stride_a] \n" + "st1 {v19.d}[1], 
[%[temp]] \n" + "add %[dst_a], %[dst_a], %[dst_stride_a], lsl #3 \n" - "mov %[temp], %[dst_b] \n" - "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n" - "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n" - "st1 {v23.d}[1], [%[temp]] \n" - "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" + "mov %[temp], %[dst_b] \n" + "st1 {v20.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v22.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v21.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v23.d}[0], [%[temp]], %[dst_stride_b] \n" + "st1 {v20.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v22.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v21.d}[1], [%[temp]], %[dst_stride_b] \n" + "st1 {v23.d}[1], [%[temp]] \n" + "add %[dst_b], %[dst_b], %[dst_stride_b], lsl #3 \n" - "b.ge 1b \n" + "b.ge 1b \n" : [temp] "=&r"(temp), // %[temp] [src] "+r"(src), // %[src] [dst_a] "+r"(dst_a), // %[dst_a] @@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src, uint8_t* dst1 = dst + dst_stride; uint8_t* dst2 = dst1 + dst_stride; uint8_t* dst3 = dst2 + dst_stride; - asm volatile ( + asm volatile( // Main loop transpose 4x4. Read a column, write a row. "1: \n" "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n" diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 2ec59759f..c2ad5b8f5 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -161,7 +161,7 @@ static const lvec8 kShuffleNV21 = { #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -192,7 +192,7 @@ void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -230,7 +230,7 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -269,7 +269,7 @@ void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" @@ -309,7 +309,7 @@ void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" @@ -339,7 +339,7 @@ void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" @@ -387,7 +387,7 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" 
"pshufd $0x0,%%xmm5,%%xmm5 \n" @@ -438,7 +438,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" @@ -475,10 +475,9 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -505,18 +504,18 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -543,11 +542,12 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } #ifdef HAS_ARGBTORGB24ROW_AVX2 @@ -555,7 +555,7 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" @@ -615,7 +615,7 @@ static const ulvec8 kPermARGBToRGB24_2 = { 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" @@ -649,7 +649,7 @@ void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" @@ -694,7 +694,7 @@ void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" @@ -734,7 +734,7 @@ void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" @@ -782,7 +782,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 
\n" @@ -824,7 +824,7 @@ void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -865,7 +865,7 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" @@ -928,7 +928,7 @@ static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 @@ -967,7 +967,7 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 @@ -1007,7 +1007,7 @@ void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 @@ -1044,7 +1044,7 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 @@ -1090,7 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1112,7 +1112,7 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" @@ -1137,7 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1159,10 +1159,9 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm2 \n" + asm volatile("movdqa %3,%%xmm2 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1175,18 +1174,18 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_ARGBTOAR64ROW_AVX2 void 
ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" @@ -1211,7 +1210,7 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb, void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" @@ -1239,7 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb, void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1265,8 +1264,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN + asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -1281,11 +1279,11 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif @@ -1360,7 +1358,7 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -1381,7 +1379,7 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1399,7 +1397,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { // Convert 16 ABGR pixels (64 bytes) to 16 YJ values. // Same as ABGRToYRow but different coefficients, no add 16. void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1417,7 +1415,7 @@ void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16. void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -1441,7 +1439,7 @@ static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" @@ -1462,7 +1460,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. 
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %5,%%ymm7 \n" @@ -1483,7 +1481,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1502,7 +1500,7 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { #ifdef HAS_ABGRTOYJROW_AVX2 // Convert 32 ABGR pixels (128 bytes) to 32 Y values. void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1521,7 +1519,7 @@ void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { #ifdef HAS_RGBATOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( @@ -1542,7 +1540,7 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -1615,7 +1613,7 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1678,7 +1676,7 @@ void ABGRToUVRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1741,7 +1739,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1806,7 +1804,7 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" @@ -1870,7 +1868,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -1936,7 +1934,7 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2001,7 +1999,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" @@ -2055,7 +2053,7 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2076,7 +2074,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, 
uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2135,7 +2133,7 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2152,7 +2150,7 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" @@ -2173,7 +2171,7 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2236,7 +2234,7 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" @@ -2657,7 +2655,8 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444 @@ -2983,7 +2982,8 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210 @@ -3015,7 +3015,8 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410 @@ -3081,7 +3082,8 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422 @@ -3109,7 +3111,8 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV12 @@ -3130,7 +3133,8 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READNV21 @@ -3151,7 +3155,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3173,7 +3177,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" @@ -3196,7 +3200,8 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* 
dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP210 @@ -3217,7 +3222,8 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP( + asm volatile( + YUVTORGB_SETUP( yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READP410 @@ -4051,7 +4057,8 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 @@ -4086,7 +4093,8 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 @@ -4161,7 +4169,8 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA444_AVX2 @@ -4195,7 +4204,8 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA422_AVX2 @@ -4271,7 +4281,8 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV12_AVX2 @@ -4297,7 +4308,8 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV21_AVX2 @@ -4323,7 +4335,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4350,7 +4362,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" @@ -4378,7 +4390,8 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP210_AVX2 @@ -4404,7 +4417,8 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP_AVX2( + asm volatile( + YUVTORGB_SETUP_AVX2( yuvconstants) 
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP410_AVX2 @@ -4501,7 +4515,7 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "movdqa 96(%3),%%xmm2 \n" // yg = 18997 = 1.164 "movdqa 128(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 @@ -4546,7 +4560,7 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( "vmovdqa 96(%3),%%ymm2 \n" // yg = 18997 = 1.164 "vmovdqa 128(%3),%%ymm3 \n" // ygb = -1160 = 1.164*16 "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 @@ -4590,10 +4604,9 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,1),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4601,21 +4614,20 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4625,11 +4637,11 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 @@ -4640,10 +4652,9 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu -0x10(%0,%2,2),%%xmm0 \n" "pshufb %%xmm5,%%xmm0 \n" @@ -4651,21 +4662,20 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" @@ -4675,11 +4685,11 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, 
uint8_t* dst_uv, int width) { "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_uv), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorUV) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_uv), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorUV) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORUVROW_AVX2 @@ -4692,7 +4702,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( + asm volatile( "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" @@ -4732,7 +4742,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -4767,10 +4777,9 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea -0x10(%0,%2,4),%0 \n" + asm volatile("lea -0x10(%0,%2,4),%0 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" @@ -4779,11 +4788,11 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc", "xmm0"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 @@ -4792,10 +4801,9 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" + asm volatile("vmovdqu %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" @@ -4803,11 +4811,11 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm5"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 @@ -4816,7 +4824,7 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -4854,7 +4862,7 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -4891,7 +4899,7 @@ void DetileRow_SSE2(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "sub $0x10,%2 \n" @@ -4912,7 +4920,7 @@ void DetileRow_16_SSE2(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -4935,7 +4943,7 @@ void DetileRow_16_AVX(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" 
"lea (%0,%3,2),%0 \n" @@ -4960,7 +4968,7 @@ void DetileToYUY2_SSE2(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" // Load 16 Y "sub $0x10,%3 \n" @@ -4999,7 +5007,7 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "movdqu %4,%%xmm1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" @@ -5026,10 +5034,9 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%zmm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" @@ -5041,12 +5048,12 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX512BW @@ -5055,10 +5062,9 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%0),%%ymm0 \n" "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" @@ -5070,12 +5076,12 @@ void MergeUVRow_AVX2(const uint8_t* src_u, "sub $0x10,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 @@ -5084,10 +5090,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" + asm volatile("sub %0,%1 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" @@ -5100,12 +5105,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u, "lea 0x20(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 @@ -5115,7 +5120,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, uint16_t* dst_uv, int depth, int width) { - asm volatile ( + asm volatile( "vmovd %4,%%xmm3 \n" "vmovd %5,%%xmm4 \n" @@ -5154,7 +5159,7 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, int depth, int width) { depth = 16 - depth; - asm volatile ( + asm volatile( "vmovd %4,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" @@ -5200,7 +5205,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm3 \n" "vpbroadcastw %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5236,7 +5241,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm3 \n" "vpbroadcastw %%xmm3,%%ymm3 \n" "sub %0,%1 \n" @@ -5272,7 +5277,7 @@ void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 
\n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5302,7 +5307,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm2 \n" "vpbroadcastw %%xmm2,%%ymm2 \n" @@ -5334,11 +5339,10 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile ( - "vpbroadcastw %3,%%zmm2 \n" + asm volatile("vpbroadcastw %3,%%zmm2 \n" - // 64 pixels per loop. - LABELALIGN + // 64 pixels per loop. + LABELALIGN "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" @@ -5353,11 +5357,11 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(scale) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_CONVERT16TO8ROW_AVX2 @@ -5369,7 +5373,7 @@ void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklwd %%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -5401,7 +5405,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vmovd %3,%%xmm2 \n" "vpbroadcastw %%xmm2,%%ymm2 \n" @@ -5456,7 +5460,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -5514,35 +5518,38 @@ static const uvec8 kSplitRGBShuffleSSE41[5] = { {0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u, 0u, 128u, 0u}, }; -void SplitRGBRow_SSE41(const uint8_t* src_rgb, uint8_t* dst_r, - uint8_t* dst_g, uint8_t* dst_b, int width) { +void SplitRGBRow_SSE41(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { asm volatile( - "movdqa 48(%5), %%xmm0 \n" - "1: \n" - "movdqu (%0),%%xmm1 \n" - "movdqu 0x10(%0),%%xmm2 \n" - "movdqu 0x20(%0),%%xmm3 \n" - "lea 0x30(%0),%0 \n" - "movdqa %%xmm1, %%xmm4 \n" - "pblendvb %%xmm3, %%xmm1 \n" - "pblendvb %%xmm2, %%xmm3 \n" - "pblendvb %%xmm4, %%xmm2 \n" - "palignr $0xF, %%xmm0, %%xmm0 \n" - "pblendvb %%xmm2, %%xmm1 \n" - "pblendvb %%xmm3, %%xmm2 \n" - "pblendvb %%xmm4, %%xmm3 \n" - "palignr $0x1, %%xmm0, %%xmm0 \n" - "pshufb 0(%5), %%xmm1 \n" - "pshufb 16(%5), %%xmm2 \n" - "pshufb 32(%5), %%xmm3 \n" - "movdqu %%xmm1,(%1) \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm2,(%2) \n" - "lea 0x10(%2),%2 \n" - "movdqu %%xmm3,(%3) \n" - "lea 0x10(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "movdqa 48(%5), %%xmm0 \n" + "1: \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm2 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm1, %%xmm4 \n" + "pblendvb %%xmm3, %%xmm1 \n" + "pblendvb %%xmm2, %%xmm3 \n" + "pblendvb %%xmm4, %%xmm2 \n" + "palignr $0xF, %%xmm0, %%xmm0 \n" + "pblendvb %%xmm2, %%xmm1 \n" + "pblendvb %%xmm3, %%xmm2 \n" + "pblendvb %%xmm4, %%xmm3 \n" + "palignr $0x1, %%xmm0, %%xmm0 \n" + "pshufb 0(%5), %%xmm1 \n" + "pshufb 16(%5), %%xmm2 \n" + "pshufb 32(%5), %%xmm3 \n" + "movdqu %%xmm1,(%1) \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm2,(%2) \n" + "lea 0x10(%2),%2 \n" + "movdqu %%xmm3,(%3) \n" + "lea 0x10(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -5554,50 +5561,53 @@ void SplitRGBRow_SSE41(const uint8_t* src_rgb, 
uint8_t* dst_r, #endif // HAS_SPLITRGBROW_SSE41 #ifdef HAS_SPLITRGBROW_AVX2 -void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, - uint8_t* dst_g, uint8_t* dst_b, int width) { +void SplitRGBRow_AVX2(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { asm volatile( - "vbroadcasti128 48(%5), %%ymm0 \n" - "vbroadcasti128 64(%5), %%ymm7 \n" + "vbroadcasti128 48(%5), %%ymm0 \n" + "vbroadcasti128 64(%5), %%ymm7 \n" #if defined(__x86_64__) - "vbroadcasti128 0(%5), %%ymm8 \n" - "vbroadcasti128 16(%5), %%ymm9 \n" - "vbroadcasti128 32(%5), %%ymm10 \n" + "vbroadcasti128 0(%5), %%ymm8 \n" + "vbroadcasti128 16(%5), %%ymm9 \n" + "vbroadcasti128 32(%5), %%ymm10 \n" #endif - "1: \n" - "vmovdqu (%0),%%ymm4 \n" - "vmovdqu 0x20(%0),%%ymm5 \n" - "vmovdqu 0x40(%0),%%ymm6 \n" - "lea 0x60(%0),%0 \n" - "vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n" - "vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n" - "vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n" - "vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n" - "vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n" - "vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n" - "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n" - "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n" - "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n" + "1: \n" + "vmovdqu (%0),%%ymm4 \n" + "vmovdqu 0x20(%0),%%ymm5 \n" + "vmovdqu 0x40(%0),%%ymm6 \n" + "lea 0x60(%0),%0 \n" + "vpblendd $240, %%ymm5, %%ymm4, %%ymm1 \n" + "vperm2i128 $33, %%ymm6, %%ymm4, %%ymm2 \n" + "vpblendd $240, %%ymm6, %%ymm5, %%ymm3 \n" + "vpblendvb %%ymm0, %%ymm3, %%ymm1, %%ymm4 \n" + "vpblendvb %%ymm0, %%ymm1, %%ymm2, %%ymm5 \n" + "vpblendvb %%ymm0, %%ymm2, %%ymm3, %%ymm6 \n" + "vpblendvb %%ymm7, %%ymm5, %%ymm4, %%ymm1 \n" + "vpblendvb %%ymm7, %%ymm6, %%ymm5, %%ymm2 \n" + "vpblendvb %%ymm7, %%ymm4, %%ymm6, %%ymm3 \n" #if defined(__x86_64__) - "vpshufb %%ymm8, %%ymm1, %%ymm1 \n" - "vpshufb %%ymm9, %%ymm2, %%ymm2 \n" - "vpshufb %%ymm10, %%ymm3, %%ymm3 \n" + "vpshufb %%ymm8, %%ymm1, %%ymm1 \n" + "vpshufb %%ymm9, %%ymm2, %%ymm2 \n" + "vpshufb %%ymm10, %%ymm3, %%ymm3 \n" #else - "vbroadcasti128 0(%5), %%ymm4 \n" - "vbroadcasti128 16(%5), %%ymm5 \n" - "vbroadcasti128 32(%5), %%ymm6 \n" - "vpshufb %%ymm4, %%ymm1, %%ymm1 \n" - "vpshufb %%ymm5, %%ymm2, %%ymm2 \n" - "vpshufb %%ymm6, %%ymm3, %%ymm3 \n" + "vbroadcasti128 0(%5), %%ymm4 \n" + "vbroadcasti128 16(%5), %%ymm5 \n" + "vbroadcasti128 32(%5), %%ymm6 \n" + "vpshufb %%ymm4, %%ymm1, %%ymm1 \n" + "vpshufb %%ymm5, %%ymm2, %%ymm2 \n" + "vpshufb %%ymm6, %%ymm3, %%ymm3 \n" #endif - "vmovdqu %%ymm1,(%1) \n" - "lea 0x20(%1),%1 \n" - "vmovdqu %%ymm2,(%2) \n" - "lea 0x20(%2),%2 \n" - "vmovdqu %%ymm3,(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" + "vmovdqu %%ymm1,(%1) \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "vmovdqu %%ymm3,(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 @@ -5607,7 +5617,8 @@ void SplitRGBRow_AVX2(const uint8_t* src_rgb, uint8_t* dst_r, : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #if defined(__x86_64__) - , "xmm8", "xmm9", "xmm10" + , + "xmm8", "xmm9", "xmm10" #endif ); } @@ -5640,7 +5651,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" @@ -5695,7 +5706,7 @@ void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile 
( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5736,7 +5747,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "movq (%2),%%xmm0 \n" // B @@ -5774,7 +5785,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -5819,7 +5830,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%2),%%xmm0 \n" // B @@ -5861,7 +5872,7 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -5912,7 +5923,7 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F @@ -5961,7 +5972,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" @@ -5994,7 +6005,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); @@ -6007,7 +6018,7 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "movdqa %5,%%xmm3 \n" LABELALIGN @@ -6048,7 +6059,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" @@ -6085,7 +6096,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 @@ -6099,7 +6110,7 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" @@ -6146,7 +6157,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 10; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants @@ -6194,7 +6205,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, #if defined(__i386__) : "m"(shift) // %5 #else - : "rm"(shift) // %5 + : "rm"(shift) // %5 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -6212,7 +6223,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -6263,7 +6274,7 @@ void MergeAR64Row_AVX2(const uint16_t* src_r, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(mask), // %7 @@ -6283,7 +6294,7 @@ void MergeXR64Row_AVX2(const uint16_t* src_r, int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" @@ -6346,7 +6357,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile 
( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -6386,7 +6397,7 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r, #if defined(__i386__) "+m"(width) // %5 #else - "+rm"(width) // %5 + "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(MergeARGB16To8Shuffle) // %7 @@ -6402,7 +6413,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, int depth, int width) { int shift = depth - 8; - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" @@ -6446,7 +6457,7 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" @@ -6486,7 +6497,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -6507,7 +6518,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_COPYROW_AVX512BW void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" @@ -6530,20 +6541,19 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc"); + asm volatile("rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" @@ -6578,7 +6588,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" @@ -6608,7 +6618,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0), %%xmm0 \n" "movdqu 0x10(%0), %%xmm1 \n" @@ -6637,7 +6647,7 @@ static const uvec8 kShuffleAlphaShort_AVX2 = { void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" @@ -6673,7 +6683,7 @@ void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm0,%%xmm0 \n" "pslld $0x18,%%xmm0 \n" "pcmpeqb %%xmm1,%%xmm1 \n" @@ -6710,7 +6720,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" @@ -6741,38 +6751,35 @@ void ARGBCopyYToAlphaRow_AVX2(const 
uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile("rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile("rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -6799,7 +6806,7 @@ void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6827,7 +6834,7 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6866,7 +6873,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6898,7 +6905,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, } void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -6922,7 +6929,7 @@ void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6961,7 +6968,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" @@ -6995,7 +7002,7 @@ void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, #ifdef HAS_YUY2TOYROW_AVX2 void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" @@ -7024,7 +7031,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7052,7 +7059,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7092,7 +7099,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw 
$0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7127,7 +7134,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, } void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -7152,7 +7159,7 @@ void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7192,7 +7199,7 @@ void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" @@ -7237,7 +7244,7 @@ void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $0xf,%%xmm7 \n" "pcmpeqb %%xmm6,%%xmm6 \n" @@ -7325,7 +7332,7 @@ void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psllw $0x8,%%xmm5 \n" "mov $0x80808080,%%eax \n" @@ -7377,7 +7384,7 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* alpha, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsllw $0x8,%%ymm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" @@ -7437,7 +7444,7 @@ static const vec8 kAttenuateShuffle = {6, -128, 6, -128, 6, -128, void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -7492,7 +7499,7 @@ static const lvec8 kAttenuateShuffle_AVX2 = { void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmovdqa %3,%%ymm4 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpslld $0x18,%%ymm5,%%ymm5 \n" @@ -7538,7 +7545,7 @@ void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -7586,7 +7593,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( + asm volatile( "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" @@ -7648,7 +7655,7 @@ void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -7710,7 +7717,7 @@ static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" @@ -7771,7 +7778,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" @@ -7836,7 +7843,7 @@ void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" @@ -7887,7 +7894,7 @@ void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( + asm volatile( "movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" @@ -7923,11 +7930,10 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" @@ -7946,12 +7952,12 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 @@ -7961,11 +7967,10 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - // 4 pixel loop. - LABELALIGN + // 4 pixel loop. + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" @@ -7983,12 +7988,12 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -7998,7 +8003,7 @@ void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8026,7 +8031,7 @@ void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8054,7 +8059,7 @@ void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. LABELALIGN "1: \n" @@ -8082,7 +8087,7 @@ void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 4 pixel loop. 
LABELALIGN "1: \n" @@ -8114,7 +8119,7 @@ void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" @@ -8168,7 +8173,7 @@ void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" @@ -8221,7 +8226,7 @@ void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8268,7 +8273,7 @@ void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -8303,7 +8308,7 @@ void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -8351,7 +8356,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" @@ -8431,7 +8436,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, int area, uint8_t* dst, int count) { - asm volatile ( + asm volatile( "movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" @@ -8566,7 +8571,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( + asm volatile( "movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" @@ -8651,7 +8656,7 @@ void InterpolateRow_SSSE3(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile ( + asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" @@ -8732,7 +8737,7 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { - asm volatile ( + asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" @@ -8809,10 +8814,9 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "movdqu (%3),%%xmm5 \n" + asm volatile("movdqu (%3),%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -8824,11 +8828,11 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 @@ -8838,10 +8842,9 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( - "vbroadcastf128 (%3),%%ymm5 \n" + asm volatile("vbroadcastf128 (%3),%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -8854,11 +8857,11 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), 
// %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 @@ -8868,10 +8871,9 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8887,13 +8889,13 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 @@ -8903,10 +8905,9 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "movq (%1),%%xmm2 \n" "movq 0x00(%1,%2,1),%%xmm1 \n" @@ -8922,13 +8923,13 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, "lea 0x20(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 @@ -8938,10 +8939,9 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8960,13 +8960,13 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_AVX2 @@ -8976,10 +8976,9 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( - "sub %1,%2 \n" + asm volatile("sub %1,%2 \n" - LABELALIGN + LABELALIGN "1: \n" "vpmovzxbw (%1),%%ymm1 \n" "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" @@ -8998,13 +8997,13 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, "sub $0x20,%4 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2"); + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_AVX2 @@ -9013,11 +9012,10 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" + asm volatile("pxor %%xmm3,%%xmm3 \n" - // 2 pixel loop. - LABELALIGN + // 2 pixel loop. 
+ LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" @@ -9055,11 +9053,12 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, "lea 0x8(%1),%1 \n" "sub $0x2,%2 \n" "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 @@ -9068,7 +9067,7 @@ void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile ( + asm volatile( "vbroadcastf128 (%3),%%ymm4 \n" "vbroadcastf128 0x10(%3),%%ymm5 \n" "vbroadcastf128 0x20(%3),%%ymm6 \n" @@ -9111,7 +9110,7 @@ void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile ( + asm volatile( "movd %3,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" @@ -9149,7 +9148,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, float scale, int width) { scale *= kScaleBias; - asm volatile ( + asm volatile( "vbroadcastss %3, %%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" @@ -9179,7 +9178,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -9190,7 +9189,7 @@ void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile ( + asm volatile( "vbroadcastss %3, %%ymm4 \n" "sub %0,%1 \n" @@ -9217,7 +9216,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } @@ -9225,7 +9224,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" // 16 pixel loop. LABELALIGN @@ -9256,7 +9255,7 @@ void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( + asm volatile( // 1 pixel loop. LABELALIGN "1: \n" @@ -9289,7 +9288,7 @@ void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( + asm volatile( // 1 pixel loop. 
LABELALIGN "1: \n" @@ -9322,7 +9321,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( + asm volatile( "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" @@ -9426,7 +9425,7 @@ void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "movdqa (%4),%%xmm4 \n" // 3 shuffler constants "movdqa 16(%4),%%xmm5 \n" @@ -9464,7 +9463,7 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" @@ -9512,7 +9511,7 @@ void NV21ToYUV24Row_AVX512(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "sub %0,%1 \n" "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants "vmovdqa 32(%4),%%ymm5 \n" @@ -9551,10 +9550,9 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "movdqu %3,%%xmm5 \n" + asm volatile("movdqu %3,%%xmm5 \n" - LABELALIGN + LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -9566,20 +9564,19 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" + asm volatile("vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -9592,11 +9589,11 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_vu), // %1 - "+r"(width) // %2 - : "m"(kShuffleUVToVU) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_AVX2 @@ -9606,7 +9603,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -9652,7 +9649,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -9694,7 +9691,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { - asm volatile ( + asm volatile( "pxor %%xmm1,%%xmm1 \n" LABELALIGN diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 734d7ee29..3613b0adc 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, int width, const struct 
RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba, int width, const struct RgbConstants* rgbconstants) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants @@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( + asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 50d5ba6a0..10546a90d 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants @@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0}; - asm volatile ( + asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants diff --git a/source/row_neon.cc b/source/row_neon.cc index cfbb364d1..8c51b6bb3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB @@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 @@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -263,7 
+263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB @@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 @@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear @@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB @@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" @@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 @@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 @@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUY2 YUVTORGB RGBTORGB8 @@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READUYVY YUVTORGB RGBTORGB8 @@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" 
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop @@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes "subs %2, %2, #16 \n" // 16 processed per loop @@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d1}, [%0], %4 \n" "subs %3, %3, #16 \n" @@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "pld [%0, #1792] \n" @@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV @@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, #endif void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q14}, [%0]! \n" // Load lower bits. "vld1.8 {q9}, [%0]! \n" // Load upper bits row @@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V @@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB @@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G @@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB @@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 q3, #255 \n" // load A(255) "1: \n" "vld1.8 {q2}, [%0]! 
\n" // load R @@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile ( + asm volatile( "vmov.u32 q14, #1023 \n" "vdup.32 q15, %5 \n" "1: \n" @@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, uint8_t* dst_ar30, int /* depth */, int width) { - asm volatile ( + asm volatile( "vmov.u32 q14, #1023 \n" "1: \n" "vld1.16 {d4}, [%2]! \n" // B @@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" @@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" @@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile ( + asm volatile( "vdup.16 q15, %6 \n" "1: \n" @@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 8 - depth; - asm volatile ( + asm volatile( "vdup.16 q15, %5 \n" "vmov.u8 d6, #0xff \n" // A (0xff) @@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop @@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( + asm volatile( "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop @@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { // ARGBSetRow writes 'width' pixels using an 32 bit value repeated. void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( + asm volatile( "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop @@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { } void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "add %0, %0, %2 \n" "sub %0, %0, #32 \n" // 32 bytes per loop @@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { } void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %2, lsl #1 \n" @@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "mov r12, #-16 \n" "add %0, %0, %3, lsl #1 \n" @@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, } void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "add %0, %0, %2, lsl #2 \n" "sub %0, #32 \n" @@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { src_rgb24 += width * 3 - 24; - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 "subs %2, #8 \n" // 8 pixels per loop. 
@@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. @@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d4, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "vmov.u8 d0, #255 \n" // Alpha "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. @@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. @@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. @@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // Alpha "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. @@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" @@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. @@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! 
\n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. @@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. @@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_uyvy "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. @@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. @@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( + asm volatile( "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. @@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us @@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us @@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "vdup.32 d7, %2 \n" // dither4 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB. @@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. @@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile ( + asm volatile( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" @@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n" // load next 8 ARGB pixels @@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON( uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%4] \n" // load rgbuvconstants "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient @@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient @@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient @@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" @@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "vld1.8 {q4}, [%3] \n" // shuffler "1: \n" @@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! 
\n" @@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vld1.8 {d8}, [%3] \n" // shuffler "1: \n" @@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "vld1.8 {d0}, [%3] \n" // load rgbconstants "vdup.u8 d20, d0[0] \n" "vdup.u8 d21, d0[1] \n" @@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( + asm volatile( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" @@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %4, #0 \n" "beq 100f \n" "cmp %4, #128 \n" @@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. @@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u16 q15, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vdup.u16 q9, %3 \n" // interval multiply. @@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( + asm volatile( "vdup.u32 q0, %3 \n" // duplicate scale value. "vzip.u8 d0, d1 \n" // d0 aarrggbb. "vshr.u16 q0, q0, #1 \n" // scale / 2. @@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. 
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient @@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d22, #35 \n" // BR coefficient @@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. @@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. @@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( // 16 pixel loop. "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. @@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" @@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" @@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts @@ -3564,11 +3564,11 @@ void HalfFloatRow_NEON(const uint16_t* src, "vqshrn.u32 d1, q9, #13 \n" "vqshrn.u32 d2, q10, #13 \n" "vqshrn.u32 d3, q11, #13 \n" - "vst1.16 {q0, q1}, [%1]! \n" // store 16 fp16 + "vst1.16 {q0, q1}, [%1]! 
\n" // store 16 fp16 "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); } @@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes @@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile ( + asm volatile( "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 @@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile ( + asm volatile( "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 @@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values @@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. @@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels @@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" @@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values @@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile ( + asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV @@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U @@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! 
\n" @@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "vdup.16 d8, %3 \n" "1: \n" "vld1.16 {q2, q3}, [%0]! \n" @@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, int scale, int width) { int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile ( + asm volatile( "vdup.16 q2, %3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 55f686766..a8ba41357 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -239,7 +239,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV444 I4XXTORGB @@ -263,7 +263,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 I4XXTORGB RGBTORGB8 @@ -290,12 +290,13 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -318,12 +319,13 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile (YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" READYUV410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" READYUV410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -345,13 +347,13 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV212 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" READYUV212 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -369,12 +371,13 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV210 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] 
[src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -391,12 +394,13 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8b, #255 \n" + "1: \n" READYUV410 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -415,13 +419,13 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, int width) { const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "movi v19.8b, #255 \n" + "1: \n" READYUV212 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -438,7 +442,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -468,13 +472,13 @@ void P210ToARGBRow_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP210 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -497,13 +501,13 @@ void P410ToARGBRow_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "movi v19.8b, #255 \n" + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP410 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -523,12 +527,12 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP210 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP210 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -549,12 +553,12 
@@ void P410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" READYUVP410 NVTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" READYUVP410 NVTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -575,13 +579,13 @@ void I422ToAR30Row_NEON(const uint8_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile ( + asm volatile( YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" READYUV422 I4XXTORGB - "subs %w[width], %w[width], #8 \n" STOREAR30 - "b.gt 1b \n" + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" READYUV422 I4XXTORGB + "subs %w[width], %w[width], #8 \n" STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -600,7 +604,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444 @@ -626,13 +630,14 @@ void I410AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "1: \n" - "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 - "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" - "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + asm volatile( + YUVTORGB_SETUP + "1: \n" + "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV410 + "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" + "st4 {v16.8b, v17.8b, v18.8b, v19.8b}, [%[dst_argb]], #32 \n" + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -651,7 +656,8 @@ void I210AlphaToARGBRow_NEON(const uint16_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "1: \n" "ld1 {v19.16b}, [%[src_a]], #16 \n" READYUV210 "uqshrn v19.8b, v19.8h, #2 \n" NVTORGB RGBTORGB8 @@ -676,7 +682,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422 @@ -701,7 +707,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v15.8b, #255 \n" /* A */ "1: \n" READYUV422 I4XXTORGB @@ -725,7 +731,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -767,7 +773,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( 
YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8_TOP @@ -807,14 +813,15 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile (YUVTORGB_SETUP - "movi v19.8h, #0x80, lsl #8 \n" + asm volatile( + YUVTORGB_SETUP + "movi v19.8h, #0x80, lsl #8 \n" "1: \n" // READYUV422 I4XXTORGB RGBTORGB8_TOP - "subs %w[width], %w[width], #8 \n" // + "subs %w[width], %w[width], #8 \n" // ARGBTOARGB1555_FROM_TOP - "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. - "b.gt 1b \n" + "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -837,7 +844,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 I4XXTORGB RGBTORGB8 @@ -861,7 +868,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v1.16b, #128 \n" "movi v19.8b, #255 \n" @@ -884,7 +891,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, #if defined(LIBYUV_USE_ST4) void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v23.8b, #255 \n" "1: \n" "ld1 {v20.8b}, [%0], #8 \n" @@ -902,7 +909,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { } #else void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v20.8b, #255 \n" "1: \n" "ldr d16, [%0], #8 \n" @@ -927,7 +934,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -950,7 +957,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" "ldr q2, [%[kNV12Table]] \n" @@ -973,7 +980,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -995,7 +1002,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB RGBTORGB8 @@ -1017,7 +1024,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" READNV12 NVTORGB @@ -1041,14 +1048,14 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" + "movi v19.8b, #255 \n" "ldr q2, [%[kNV21InterleavedTable]] \n" - "1: \n" READYUY2 NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READYUY2 NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_yuy2] "+r"(src_yuy2), // 
%[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -1062,14 +1069,14 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( + asm volatile( YUVTORGB_SETUP - "movi v19.8b, #255 \n" + "movi v19.8b, #255 \n" "ldr q2, [%[kNV12InterleavedTable]] \n" - "1: \n" READUYVY NVTORGB RGBTORGB8 - "subs %w[width], %w[width], #8 \n" + "1: \n" READUYVY NVTORGB RGBTORGB8 + "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -1084,7 +1091,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -1109,7 +1116,7 @@ void DetileRow_NEON(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -1129,7 +1136,7 @@ void DetileRow_16_NEON(const uint16_t* src, ptrdiff_t src_tile_stride, uint16_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -1150,7 +1157,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n" "subs %w3, %w3, #16 \n" @@ -1175,7 +1182,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "prfm pldl1keep, [%0, 1792] \n" @@ -1201,7 +1208,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs @@ -1226,7 +1233,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y, // Unpack MT2T into tiled P010 64 pixels at a time. See // tinyurl.com/mtk-10bit-video-format for format documentation. 
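// [Editor's sketch, not part of the patch] A scalar reading of the MT2T ->
// P010 unpack below: each 80-byte MT2T block carries 64 samples as 16 bytes
// of packed 2-bit low bits followed by 64 bytes of 8-bit high bits, and P010
// wants the 10-bit sample left-justified in a 16-bit lane. The grouping of
// the low bits here is an assumption for illustration; the essential part is
// the (msb << 8) | (lsb << 6) packing.
static void UnpackMT2T_Sketch(const uint8_t* src, uint16_t* dst, size_t size) {
  for (size_t i = 0; i < size; i += 80) {
    const uint8_t* lo = src + i;       // 16 bytes of packed 2-bit LSBs
    const uint8_t* hi = src + i + 16;  // 64 bytes of 8-bit MSBs
    for (int j = 0; j < 4; ++j) {      // assumed: group j uses bits 2j..2j+1
      for (int k = 0; k < 16; ++k) {
        uint16_t lsb = (lo[k] >> (j * 2)) & 0x3;
        *dst++ = (uint16_t)((*hi++ << 8) | (lsb << 6));  // top 10 bits valid
      }
    }
  }
}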
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v7.16b}, [%0], #16 \n" "ld1 {v0.16b-v3.16b}, [%0], #64 \n" @@ -1267,7 +1274,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1291,7 +1298,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1316,7 +1323,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V @@ -1342,7 +1349,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U @@ -1371,7 +1378,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1396,7 +1403,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_rgb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1424,7 +1431,7 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_b, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w5, %w5, #16 \n" // 16 processed per loop @@ -1453,7 +1460,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1484,7 +1491,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_a, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v1.16b}, [%1], #16 \n" // load G @@ -1524,7 +1531,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_g, uint8_t* dst_b, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w4, %w4, #16 \n" // 16 processed per loop @@ -1549,7 +1556,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_b, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.16b, #255 \n" // load A(255) "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load R @@ -1578,7 +1585,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r, int depth, int width) { int shift = 10 - depth; - asm volatile ( + asm volatile( "movi v30.16b, #255 \n" "ushr v30.4s, v30.4s, #22 \n" // 1023 "dup v31.4s, %w5 \n" @@ -1619,24 +1626,24 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r, // Neon has no "shift left and accumulate/orr", so use a multiply-add to // perform the shift instead. 
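// [Editor's note, not part of the patch] The MLAs in the block below stand in
// for "shift left then OR": with the destination bits known to be zero,
//   acc |= r << 4    ==  acc += r * 16    (v6 holds 1 << 4)
//   acc |= g << 10   ==  acc += g * 1024  (v7 holds 1 << 10)
// which is why the constants 16 and 1024 are loaded instead of shift counts;
// the remaining high bits of G are then merged with a shift-right-and-
// accumulate (usra).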
int limit = 1023; - asm volatile ( - "dup v5.8h, %w[limit] \n" - "movi v6.8h, #16 \n" // 1 << 4 - "movi v7.8h, #4, lsl #8 \n" // 1 << 10 - "1: \n" - "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr - "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg - "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb - "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr - "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg - "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 - "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb - "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 - "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb - "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg - "subs %w4, %w4, #8 \n" - "st2 {v3.8h, v4.8h}, [%3], #32 \n" - "b.gt 1b \n" + asm volatile( + "dup v5.8h, %w[limit] \n" + "movi v6.8h, #16 \n" // 1 << 4 + "movi v7.8h, #4, lsl #8 \n" // 1 << 10 + "1: \n" + "ldr q0, [%0], #16 \n" // xxxxxxRrrrrrrrrr + "ldr q1, [%1], #16 \n" // xxxxxxGggggggggg + "ldr q2, [%2], #16 \n" // xxxxxxBbbbbbbbbb + "umin v0.8h, v0.8h, v5.8h \n" // 000000Rrrrrrrrrr + "umin v1.8h, v1.8h, v5.8h \n" // 000000Gggggggggg + "movi v4.8h, #0xc0, lsl #8 \n" // 1100000000000000 + "umin v3.8h, v2.8h, v5.8h \n" // 000000Bbbbbbbbbb + "mla v4.8h, v0.8h, v6.8h \n" // 11Rrrrrrrrrr0000 + "mla v3.8h, v1.8h, v7.8h \n" // ggggggBbbbbbbbbb + "usra v4.8h, v1.8h, #6 \n" // 11RrrrrrrrrrGggg + "subs %w4, %w4, #8 \n" + "st2 {v3.8h, v4.8h}, [%3], #32 \n" + "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 @@ -1655,7 +1662,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" @@ -1698,7 +1705,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; - asm volatile ( + asm volatile( "movi v3.16b, #0xff \n" // A (0xffff) "dup v30.8h, %w6 \n" @@ -1739,7 +1746,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v31.8h, %w6 \n" "1: \n" "ldr q0, [%0], #16 \n" // B @@ -1777,7 +1784,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, int width) { // Shift is 8 - depth, +8 so the result is in the top half of each lane. int shift = 16 - depth; - asm volatile ( + asm volatile( "dup v31.8h, %w5 \n" "movi v3.16b, #0xff \n" // A (0xff) "1: \n" @@ -1806,7 +1813,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r, // Copy multiple of 32. void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" @@ -1823,7 +1830,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { // SetRow writes 'width' bytes using an 8 bit value repeated. 
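// [Editor's note, not part of the patch] Behaviourally SetRow_NEON is a
// vectorised memset(dst, v8, width), 16 bytes per iteration, and
// ARGBSetRow_NEON below it is the same idea with a replicated 32-bit pixel,
// 4 pixels per iteration.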
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { - asm volatile ( + asm volatile( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop @@ -1836,7 +1843,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { } void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { - asm volatile ( + asm volatile( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop @@ -1853,7 +1860,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v3.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw \n" @@ -1878,7 +1885,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" @@ -1902,7 +1909,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" @@ -1931,7 +1938,7 @@ static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" @@ -1954,7 +1961,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. "add %0, %0, %w2, sxtw \n" @@ -1979,7 +1986,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v4.8b, #255 \n" // Alpha "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of @@ -1997,7 +2004,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b @@ -2016,7 +2023,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { - asm volatile ( + asm volatile( "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b @@ -2035,7 +2042,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. 
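// [Editor's note, not part of the patch] In libyuv's byte-order naming RGB24
// is b,g,r in memory and RAW is r,g,b, so RAWToRGB24 is a per-pixel swap of
// bytes 0 and 2; a scalar equivalent (illustrative only) is:
//   for (int x = 0; x < width; ++x) {
//     dst_rgb24[0] = src_raw[2];
//     dst_rgb24[1] = src_raw[1];
//     dst_rgb24[2] = src_raw[0];
//     src_raw += 3;
//     dst_rgb24 += 3;
//   }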
@@ -2067,14 +2074,14 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( - "movi v3.16b, #255 \n" // Alpha - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" + "movi v3.16b, #255 \n" // Alpha + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2115,14 +2122,14 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels - "prfm pldl1keep, [%0, 448] \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 ARGB1555 pixels + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop ARGB1555TOARGB - "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB - "add %1, %1, #64 \n" - "b.gt 1b \n" + "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%1] \n" // store 16 ARGB + "add %1, %1, #64 \n" + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -2150,7 +2157,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
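// [Editor's note, not part of the patch] For reference, ARGB4444 expands to
// 8-bit ARGB by nibble replication in the scalar path, v8 = (v4 << 4) | v4
// (so 0xF -> 0xFF and 0x0 -> 0x00); the NEON hunks here operate on the packed
// 4444 data and perform the equivalent expansion with shifts and inserts.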
@@ -2179,29 +2186,29 @@ static void ABCDToAR30Row_NEON(const uint8_t* src_abcd, uint8_t* dst_ar30, int width, const uint8_t* indices) { - asm volatile ( - "movi v2.4s, #0xf, msl 16 \n" // 0xfffff - "ldr q3, [%[kAR30Row_BoxShifts]] \n" - "ldp q4, q5, [%[indices]] \n" + asm volatile( + "movi v2.4s, #0xf, msl 16 \n" // 0xfffff + "ldr q3, [%[kAR30Row_BoxShifts]] \n" + "ldp q4, q5, [%[indices]] \n" "1: \n" - "ldp q0, q20, [%[src]], #32 \n" - "subs %w[width], %w[width], #8 \n" - "tbl v1.16b, {v0.16b}, v5.16b \n" - "tbl v21.16b, {v20.16b}, v5.16b \n" - "tbl v0.16b, {v0.16b}, v4.16b \n" - "tbl v20.16b, {v20.16b}, v4.16b \n" - "ushl v0.8h, v0.8h, v3.8h \n" - "ushl v20.8h, v20.8h, v3.8h \n" - "ushl v1.8h, v1.8h, v3.8h \n" - "ushl v21.8h, v21.8h, v3.8h \n" - "ushr v0.4s, v0.4s, #6 \n" - "ushr v20.4s, v20.4s, #6 \n" - "shl v1.4s, v1.4s, #14 \n" - "shl v21.4s, v21.4s, #14 \n" - "bif v0.16b, v1.16b, v2.16b \n" - "bif v20.16b, v21.16b, v2.16b \n" - "stp q0, q20, [%[dst]], #32 \n" - "b.gt 1b \n" + "ldp q0, q20, [%[src]], #32 \n" + "subs %w[width], %w[width], #8 \n" + "tbl v1.16b, {v0.16b}, v5.16b \n" + "tbl v21.16b, {v20.16b}, v5.16b \n" + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v20.16b, {v20.16b}, v4.16b \n" + "ushl v0.8h, v0.8h, v3.8h \n" + "ushl v20.8h, v20.8h, v3.8h \n" + "ushl v1.8h, v1.8h, v3.8h \n" + "ushl v21.8h, v21.8h, v3.8h \n" + "ushr v0.4s, v0.4s, #6 \n" + "ushr v20.4s, v20.4s, #6 \n" + "shl v1.4s, v1.4s, #14 \n" + "shl v21.4s, v21.4s, #14 \n" + "bif v0.16b, v1.16b, v2.16b \n" + "bif v20.16b, v21.16b, v2.16b \n" + "stp q0, q20, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_abcd), // %[src] [dst] "+r"(dst_ar30), // %[dst] [width] "+r"(width) // %[width] @@ -2221,7 +2228,7 @@ void ARGBToAR30Row_NEON(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. @@ -2237,7 +2244,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -2255,7 +2262,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2271,7 +2278,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. @@ -2290,7 +2297,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2311,7 +2318,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. 
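// [Editor's note, not part of the patch] YUY2 packs pixels as Y0,U,Y1,V and
// UYVY as U,Y0,V,Y1, so the ld4 of four byte-planes in these hunks leaves the
// chroma samples in their own registers: for YUY2 U/V land in v1/v3, for UYVY
// in v0/v2, ready to be stored directly as planar UV422.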
@@ -2334,7 +2341,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_v, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2362,7 +2369,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_v, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. @@ -2389,7 +2396,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_uv, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. @@ -2412,7 +2419,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile ( + asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. @@ -2434,7 +2441,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels @@ -2458,7 +2465,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile ( + asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "mov v3.8b, v2.8b \n" @@ -2480,7 +2487,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2499,7 +2506,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { - asm volatile ( + asm volatile( "dup v1.4s, %w3 \n" // dither4 "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB @@ -2537,7 +2544,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels @@ -2556,7 +2563,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" @@ -2579,7 +2586,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels @@ -2602,7 +2609,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
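// [Editor's note, not part of the patch] AR64 is ARGB at 16 bits per channel,
// and widening from 8 bits duplicates each byte into both halves of the lane
// (v16 = v8 * 0x0101), so 0xFF maps to 0xFFFF; the register copies followed
// by interleaves in these ARGBToAR64/ARGBToAB64 hunks implement that
// duplication.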
@@ -2627,7 +2634,7 @@ static const uvec8 kShuffleARGBToAB64[2] = { void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { - asm volatile ( + asm volatile( "ldp q6, q7, [%3] \n" // 2 shufflers "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels @@ -2653,7 +2660,7 @@ static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2677,7 +2684,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels @@ -2698,7 +2705,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "prfm pldl1keep, [%0, 448] \n" @@ -2772,7 +2779,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( uint8_t* dst_v, int width, const struct RgbUVConstantsI8* rgbuvconstants) { - asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" + asm("ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" "movi v29.16b, #0x80 \n" // 128.5 "1: \n" "ldp q0, q1, [%[src]], #32 \n" @@ -3288,30 +3295,30 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. + "1: \n" + "ldp q0, q4, [%0], #32 \n" // load 16 RGB565 pixels. RGB565TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels. + "ldp q0, q4, [%1], #32 \n" // load 16 RGB565 pixels. RGB565TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 @@ -3332,30 +3339,30 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile( RGBTOUV_SETUP_REG - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. RGB555TOARGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels. + "ldp q0, q3, [%1], #32 \n" // load 16 ARGB1555 pixels. RGB555TOARGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 @@ -3376,30 +3383,30 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( RGBTOUV_SETUP_REG // sets v20-v25 - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. ARGB4444TORGB - "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%0, 448] \n" - "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" + "uaddlp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels. + "ldp q0, q3, [%1], #32 \n" // load 16 ARGB4444 pixels. ARGB4444TORGB - "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" - "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v16.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" + "uadalp v17.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v18.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "urshr v0.8h, v16.8h, #1 \n" // 2x average - "urshr v1.8h, v17.8h, #1 \n" - "urshr v2.8h, v18.8h, #1 \n" + "urshr v0.8h, v16.8h, #1 \n" // 2x average + "urshr v1.8h, v17.8h, #1 \n" + "urshr v2.8h, v18.8h, #1 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 @@ -3448,26 +3455,26 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( - "movi v4.16b, #25 \n" // B * 0.1016 coefficient - "movi v5.16b, #129 \n" // G * 0.5078 coefficient - "movi v6.16b, #66 \n" // R * 0.2578 coefficient - "movi v7.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. + "movi v4.16b, #25 \n" // B * 0.1016 coefficient + "movi v5.16b, #129 \n" // G * 0.5078 coefficient + "movi v6.16b, #66 \n" // R * 0.2578 coefficient + "movi v7.16b, #16 \n" // Add 16 constant + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. RGB555TOARGB - "umull v16.8h, v0.8b, v4.8b \n" // B - "umull2 v17.8h, v0.16b, v4.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal2 v17.8h, v1.16b, v5.16b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "umlal2 v17.8h, v2.16b, v6.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v7.16b \n" - "str q0, [%1], #16 \n" // store pixels Y. - "b.gt 1b \n" + "umull v16.8h, v0.8b, v4.8b \n" // B + "umull2 v17.8h, v0.16b, v4.16b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal2 v17.8h, v1.16b, v5.16b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "umlal2 v17.8h, v2.16b, v6.16b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.16b, v0.16b, v7.16b \n" + "str q0, [%1], #16 \n" // store pixels Y. + "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3480,26 +3487,26 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( - "movi v24.16b, #25 \n" // B * 0.1016 coefficient - "movi v25.16b, #129 \n" // G * 0.5078 coefficient - "movi v26.16b, #66 \n" // R * 0.2578 coefficient - "movi v27.16b, #16 \n" // Add 16 constant - "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. - "subs %w2, %w2, #16 \n" // 16 processed per loop. + "movi v24.16b, #25 \n" // B * 0.1016 coefficient + "movi v25.16b, #129 \n" // G * 0.5078 coefficient + "movi v26.16b, #66 \n" // R * 0.2578 coefficient + "movi v27.16b, #16 \n" // Add 16 constant + "1: \n" + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. + "subs %w2, %w2, #16 \n" // 16 processed per loop. ARGB4444TORGB - "umull v16.8h, v0.8b, v24.8b \n" // B - "umull2 v17.8h, v0.16b, v24.16b \n" // B - "prfm pldl1keep, [%0, 448] \n" - "umlal v16.8h, v1.8b, v25.8b \n" // G - "umlal2 v17.8h, v1.16b, v25.16b \n" // G - "umlal v16.8h, v2.8b, v26.8b \n" // R - "umlal2 v17.8h, v2.16b, v26.16b \n" // R - "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y - "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y - "uqadd v0.16b, v0.16b, v27.16b \n" - "str q0, [%1], #16 \n" // store 8 pixels Y. 
- "b.gt 1b \n" + "umull v16.8h, v0.8b, v24.8b \n" // B + "umull2 v17.8h, v0.16b, v24.16b \n" // B + "prfm pldl1keep, [%0, 448] \n" + "umlal v16.8h, v1.8b, v25.8b \n" // G + "umlal2 v17.8h, v1.16b, v25.16b \n" // G + "umlal v16.8h, v2.8b, v26.8b \n" // R + "umlal2 v17.8h, v2.16b, v26.16b \n" // R + "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y + "uqrshrn2 v0.16b, v17.8h, #8 \n" // 16 bit to 8 bit Y + "uqadd v0.16b, v0.16b, v27.16b \n" + "str q0, [%1], #16 \n" // store 8 pixels Y. + "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -3517,7 +3524,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3551,7 +3558,7 @@ static void ARGBToYMatrixRow_NEON_DotProd( uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v16.4s, v0.s[0] \n" "dup v17.8h, v0.h[2] \n" @@ -3653,7 +3660,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v6.16b, v0.b[0] \n" "dup v7.16b, v0.b[1] \n" @@ -3725,7 +3732,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct RgbConstants* rgbconstants) { - asm volatile ( + asm volatile( "ldr d0, [%3] \n" // load rgbconstants "dup v5.16b, v0.b[0] \n" "dup v6.16b, v0.b[1] \n" @@ -3777,7 +3784,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3843,7 +3850,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint16_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( + asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" @@ -3915,7 +3922,7 @@ void InterpolateRow_16To8_NEON(uint8_t* dst_ptr, const uint16_t* src_ptr1 = src_ptr + src_stride; int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr - asm volatile ( + asm volatile( "dup v6.8h, %w6 \n" "cmp %w4, #0 \n" "b.eq 100f \n" @@ -3983,7 +3990,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. @@ -4054,7 +4061,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb, void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v7.8h, #0x00ff \n" // 255 for rounding up // Attenuate 8 pixels. @@ -4084,7 +4091,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int interval_size, int interval_offset, int width) { - asm volatile ( + asm volatile( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 "dup v5.8h, %w3 \n" // interval multiply. @@ -4127,30 +4134,30 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. 
+ "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. - "1: \n" + "1: \n" "ld1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" - "prfm pldl1keep, [%0, 448] \n" - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.8h \n" - "sqrdmulh v6.8h, v6.8h, v0.8h \n" - "sqrdmulh v7.8h, v7.8h, v0.8h \n" - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.8h \n" // argb * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.8h \n" + "sqrdmulh v6.8h, v6.8h, v0.8h \n" + "sqrdmulh v7.8h, v7.8h, v0.8h \n" + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" "st1 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -4162,7 +4169,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient @@ -4193,22 +4200,22 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "ld1r {v24.4s}, [%[coeffs]] \n" - "ldr q25, [%[indices]] \n" - "1: \n" - "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB - "movi v0.4s, #0 \n" - "movi v2.4s, #0 \n" - "subs %w[width], %w[width], #8 \n" // 8 processed per loop - "udot v0.4s, v1.16b, v24.16b \n" - "udot v2.4s, v3.16b, v24.16b \n" - "prfm pldl1keep, [%[src], 448] \n" - "uqrshrn v0.8b, v0.8h, #8 \n" - "uqrshrn v2.8b, v2.8h, #8 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha - "tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n" - "stp q0, q1, [%[dst]], #32 \n" // store 8 pixels - "b.gt 1b \n" + "ld1r {v24.4s}, [%[coeffs]] \n" + "ldr q25, [%[indices]] \n" + "1: \n" + "ldp q1, q3, [%[src]], #32 \n" // load 8 ARGB + "movi v0.4s, #0 \n" + "movi v2.4s, #0 \n" + "subs %w[width], %w[width], #8 \n" // 8 processed per loop + "udot v0.4s, v1.16b, v24.16b \n" + "udot v2.4s, v3.16b, v24.16b \n" + "prfm pldl1keep, [%[src], 448] \n" + "uqrshrn v0.8b, v0.8h, #8 \n" + "uqrshrn v2.8b, v2.8h, #8 \n" + "tbl v0.16b, {v0.16b, v1.16b}, v25.16b \n" // merge in alpha + "tbl v1.16b, {v2.16b, v3.16b}, v25.16b \n" + "stp q0, q1, [%[dst]], #32 \n" // store 8 pixels + "b.gt 1b \n" : [src] "+r"(src_argb), // %[src] [dst] "+r"(dst_argb), // %[dst] [width] "+r"(width) // %[width] @@ -4223,7 +4230,7 @@ void ARGBGrayRow_NEON_DotProd(const uint8_t* src_argb, // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v20.8b, #17 \n" // BB coefficient "movi v21.8b, #68 \n" // BG coefficient "movi v22.8b, #35 \n" // BR coefficient @@ -4265,32 +4272,32 @@ static const uvec8 kARGBSepiaRowAlphaIndices = {3, 7, 11, 15, 19, 23, 27, 31}; void ARGBSepiaRow_NEON_DotProd(uint8_t* dst_argb, int width) { asm volatile( 
"ld3r {v20.4s, v21.4s, v22.4s}, [%[coeffs]] \n" - "ldr d23, [%[indices]] \n" - "1: \n" - "ldp q0, q1, [%[dst]] \n" - "movi v2.4s, #0 \n" - "movi v3.4s, #0 \n" - "movi v4.4s, #0 \n" - "movi v5.4s, #0 \n" - "movi v6.4s, #0 \n" - "movi v7.4s, #0 \n" - "udot v2.4s, v0.16b, v20.16b \n" - "udot v3.4s, v1.16b, v20.16b \n" - "udot v4.4s, v0.16b, v21.16b \n" - "udot v5.4s, v1.16b, v21.16b \n" - "udot v6.4s, v0.16b, v22.16b \n" - "udot v7.4s, v1.16b, v22.16b \n" - "subs %w1, %w1, #8 \n" - "prfm pldl1keep, [%[dst], 448] \n" - "uzp1 v6.8h, v6.8h, v7.8h \n" - "uzp1 v5.8h, v4.8h, v5.8h \n" - "uzp1 v4.8h, v2.8h, v3.8h \n" + "ldr d23, [%[indices]] \n" + "1: \n" + "ldp q0, q1, [%[dst]] \n" + "movi v2.4s, #0 \n" + "movi v3.4s, #0 \n" + "movi v4.4s, #0 \n" + "movi v5.4s, #0 \n" + "movi v6.4s, #0 \n" + "movi v7.4s, #0 \n" + "udot v2.4s, v0.16b, v20.16b \n" + "udot v3.4s, v1.16b, v20.16b \n" + "udot v4.4s, v0.16b, v21.16b \n" + "udot v5.4s, v1.16b, v21.16b \n" + "udot v6.4s, v0.16b, v22.16b \n" + "udot v7.4s, v1.16b, v22.16b \n" + "subs %w1, %w1, #8 \n" + "prfm pldl1keep, [%[dst], 448] \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "uzp1 v5.8h, v4.8h, v5.8h \n" + "uzp1 v4.8h, v2.8h, v3.8h \n" "tbl v3.16b, {v0.16b, v1.16b}, v23.16b \n" - "uqshrn v0.8b, v4.8h, #7 \n" - "uqshrn v1.8b, v5.8h, #7 \n" - "uqshrn v2.8b, v6.8h, #7 \n" + "uqshrn v0.8b, v4.8h, #7 \n" + "uqshrn v1.8b, v5.8h, #7 \n" + "uqshrn v2.8b, v6.8h, #7 \n" "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%[dst]], #32 \n" - "b.gt 1b \n" + "b.gt 1b \n" : [dst] "+r"(dst_argb), // %[dst] [width] "+r"(width) // %[width] : [coeffs] "r"(&kARGBSepiaRowCoeffs), // %[coeffs] @@ -4306,7 +4313,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( + asm volatile( "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. @@ -4365,51 +4372,51 @@ void ARGBColorMatrixRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { - asm volatile ( - "ld1 {v31.16b}, [%[matrix_argb]] \n" + asm volatile( + "ld1 {v31.16b}, [%[matrix_argb]] \n" - "1: \n" - "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%[src_argb]], #32 \n" - "movi v16.4s, #0 \n" - "movi v17.4s, #0 \n" - "movi v18.4s, #0 \n" - "movi v19.4s, #0 \n" - "movi v20.4s, #0 \n" - "movi v21.4s, #0 \n" - "movi v22.4s, #0 \n" - "movi v23.4s, #0 \n" + "movi v16.4s, #0 \n" + "movi v17.4s, #0 \n" + "movi v18.4s, #0 \n" + "movi v19.4s, #0 \n" + "movi v20.4s, #0 \n" + "movi v21.4s, #0 \n" + "movi v22.4s, #0 \n" + "movi v23.4s, #0 \n" // 8 processed per loop. 
- "subs %w2, %w2, #8 \n" - "prfm pldl1keep, [%[src_argb], 448] \n" + "subs %w2, %w2, #8 \n" + "prfm pldl1keep, [%[src_argb], 448] \n" - "sudot v16.4s, v31.16b, v0.4b[0] \n" - "sudot v17.4s, v31.16b, v0.4b[1] \n" - "sudot v18.4s, v31.16b, v0.4b[2] \n" - "sudot v19.4s, v31.16b, v0.4b[3] \n" - "sudot v20.4s, v31.16b, v1.4b[0] \n" - "sudot v21.4s, v31.16b, v1.4b[1] \n" - "sudot v22.4s, v31.16b, v1.4b[2] \n" - "sudot v23.4s, v31.16b, v1.4b[3] \n" + "sudot v16.4s, v31.16b, v0.4b[0] \n" + "sudot v17.4s, v31.16b, v0.4b[1] \n" + "sudot v18.4s, v31.16b, v0.4b[2] \n" + "sudot v19.4s, v31.16b, v0.4b[3] \n" + "sudot v20.4s, v31.16b, v1.4b[0] \n" + "sudot v21.4s, v31.16b, v1.4b[1] \n" + "sudot v22.4s, v31.16b, v1.4b[2] \n" + "sudot v23.4s, v31.16b, v1.4b[3] \n" - "shrn v16.4h, v16.4s, #6 \n" - "shrn v18.4h, v18.4s, #6 \n" - "shrn v20.4h, v20.4s, #6 \n" - "shrn v22.4h, v22.4s, #6 \n" - "shrn2 v16.8h, v17.4s, #6 \n" - "shrn2 v18.8h, v19.4s, #6 \n" - "shrn2 v20.8h, v21.4s, #6 \n" - "shrn2 v22.8h, v23.4s, #6 \n" + "shrn v16.4h, v16.4s, #6 \n" + "shrn v18.4h, v18.4s, #6 \n" + "shrn v20.4h, v20.4s, #6 \n" + "shrn v22.4h, v22.4s, #6 \n" + "shrn2 v16.8h, v17.4s, #6 \n" + "shrn2 v18.8h, v19.4s, #6 \n" + "shrn2 v20.8h, v21.4s, #6 \n" + "shrn2 v22.8h, v23.4s, #6 \n" - "uqxtn v16.8b, v16.8h \n" - "uqxtn v18.8b, v18.8h \n" - "uqxtn v20.8b, v20.8h \n" - "uqxtn v22.8b, v22.8h \n" + "uqxtn v16.8b, v16.8h \n" + "uqxtn v18.8b, v18.8h \n" + "uqxtn v20.8b, v20.8h \n" + "uqxtn v22.8b, v22.8h \n" - "stp d16, d18, [%[dst_argb]], #16 \n" - "stp d20, d22, [%[dst_argb]], #16 \n" - "b.gt 1b \n" + "stp d16, d18, [%[dst_argb]], #16 \n" + "stp d20, d22, [%[dst_argb]], #16 \n" + "b.gt 1b \n" : [src_argb] "+r"(src_argb), // %[src_argb] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] @@ -4423,7 +4430,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB @@ -4454,7 +4461,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4479,7 +4486,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( // 8 pixel loop. "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 ARGB @@ -4508,7 +4515,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. "1: \n" @@ -4535,7 +4542,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( // 16 pixel loop. "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. @@ -4563,7 +4570,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { - asm volatile ( + asm volatile( "movi v3.8b, #255 \n" // alpha // 8 pixel loop. 
"1: \n" @@ -4592,7 +4599,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" @@ -4633,7 +4640,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" @@ -4702,7 +4709,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes "subs %w2, %w2, #8 \n" // 8 pixels per loop @@ -4727,7 +4734,7 @@ void ByteToFloatRow_NEON(const uint8_t* src, void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16 float* dst, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4749,7 +4756,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 int src_stride, // stride in elements float* dst, int width) { - asm volatile ( + asm volatile( "cmp %w2, #8 \n" // Is there 8 rows? "b.lo 2f \n" "1: \n" @@ -4787,7 +4794,7 @@ void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16 void ConvertFP32ToFP16Row_NEON(const float* src, uint16_t* dst, // fp16 int width) { - asm volatile ( + asm volatile( "1: \n" "ldp q2, q3, [%0], #32 \n" // load 8 floats "subs %w2, %w2, #8 \n" // 8 floats per loop @@ -4808,7 +4815,7 @@ float ScaleMaxSamples_NEON(const float* src, float scale, int width) { float fmax; - asm volatile ( + asm volatile( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" @@ -4838,7 +4845,7 @@ float ScaleSumSamples_NEON(const float* src, float scale, int width) { float fsum; - asm volatile ( + asm volatile( "movi v5.4s, #0 \n" // max "movi v6.4s, #0 \n" // max @@ -4865,7 +4872,7 @@ float ScaleSumSamples_NEON(const float* src, } void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples "prfm pldl1keep, [%0, 448] \n" @@ -4889,7 +4896,7 @@ void GaussCol_NEON(const uint16_t* src0, const uint16_t* src4, uint32_t* dst, int width) { - asm volatile ( + asm volatile( "movi v6.8h, #4 \n" // constant 4 "movi v7.8h, #6 \n" // constant 6 @@ -4931,7 +4938,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; - asm volatile ( + asm volatile( "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 @@ -4974,7 +4981,7 @@ void GaussCol_F32_NEON(const float* src0, const float* src4, float* dst, int width) { - asm volatile ( + asm volatile( "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6 "1: \n" @@ -5012,7 +5019,7 @@ void GaussCol_F32_NEON(const float* src0, // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussRow_F32_NEON(const float* src, float* dst, int width) { - asm volatile ( + asm volatile( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 "1: \n" @@ -5051,7 +5058,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values @@ -5082,7 +5089,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - asm volatile ( + asm volatile( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values @@ -5112,7 +5119,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_uv, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -5141,7 +5148,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, uint8_t* dst_vu, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv @@ -5167,7 +5174,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { - asm volatile ( + asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop @@ -5183,7 +5190,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" @@ -5208,7 +5215,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values @@ -5243,7 +5250,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, int depth, int width) { int shift = depth - 16; // Negative for right shift. - asm volatile ( + asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV @@ -5266,7 +5273,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" @@ -5287,7 +5294,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { - asm volatile ( + asm volatile( "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" @@ -5321,7 +5328,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y, // saturate, then we can just use UZP2 to narrow rather than a pair of // saturating narrow instructions. int shift = 23 - __builtin_clz((int32_t)scale); - asm volatile ( + asm volatile( "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 62c6b2631..07606d7a8 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -47,7 +47,7 @@ extern "C" { // register) is set to round-to-nearest-up mode(0). 
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \ { \ - asm volatile ("csrwi vxrm, 0"); \ + asm volatile("csrwi vxrm, 0"); \ ub = yuvconst->kUVCoeff[0]; \ vr = yuvconst->kUVCoeff[1]; \ ug = yuvconst->kUVCoeff[2]; \ @@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y, vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl); // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) sets to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); if (is_yb_positive) { v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl); } else { @@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr, } // To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); // Blend 50 / 50. if (y1_fraction == 128) { do { diff --git a/source/row_sve.cc b/source/row_sve.cc index 20b9c4bde..8076c9ebc 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, const int16_t* uvconstants) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; uint64_t vl; - asm volatile ( + asm volatile( "ptrue p0.b \n" "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" diff --git a/source/scale_argb.cc b/source/scale_argb.cc index da99febb9..c009e0574 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -10,8 +10,8 @@ #include "libyuv/scale.h" -#include #include +#include #include #include #include @@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y, (void)src_fourcc; // TODO(fbarchard): implement and/or assert. (void)dst_fourcc; const int abs_src_height = (src_height < 0) ? -src_height : src_height; - if (!src_y || !src_u || !src_v || !dst_argb || - src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || - dst_width <= 0 || dst_height <= 0 || - clip_width <= 0 || clip_height <= 0) { + if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 || + src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 || + dst_height <= 0 || clip_width <= 0 || clip_height <= 0) { return -1; } const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4; @@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y, I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, argb_buffer, src_width * 4, src_width, src_height); - r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb, - dst_stride_argb, dst_width, dst_height, clip_x, clip_y, - clip_width, clip_height, filtering); + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, + dst_argb, dst_stride_argb, dst_width, dst_height, clip_x, + clip_y, clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 27cdc17aa..c5dabd409 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( // 16 pixel loop. 
LABELALIGN "1: \n" @@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" @@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" @@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" @@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" @@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" @@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" @@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" @@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 @@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile ( + asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 @@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, "m"(kMadd11), // %1 "m"(kRound34) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa 
%0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 @@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); - asm volatile ( + asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 @@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, "m"(kRound34) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" @@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" @@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" @@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" @@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" @@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" @@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" @@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line @@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" @@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 @@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" @@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" @@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, 
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 @@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" @@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" @@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 @@ -1761,11 +1761,10 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" - // 16 pixel loop. - LABELALIGN + // 16 pixel loop. 
+ LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 @@ -1781,11 +1780,11 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 @@ -1793,10 +1792,9 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN + LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 @@ -1811,11 +1809,11 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 @@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, int x, int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( + asm volatile( "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" @@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr, int dx) { (void)x; (void)dx; - asm volatile ( + asm volatile( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { - asm volatile ( + asm volatile( "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" @@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; - asm volatile ( + asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" @@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( + asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" @@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile ( + asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" @@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, int dx) { (void)x; (void)dx; - asm volatile ( + asm volatile( "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" @@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, int x, int dx) { intptr_t x0, x1; - asm volatile ( + asm volatile( "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : @@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, 
"m"(kShuffleFractions) // %1 ); - asm volatile ( + asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" @@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { - asm volatile ( + asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" @@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) { // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( + asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" @@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" @@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" @@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 @@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 @@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 @@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" @@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" @@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr, void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 @@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { - asm volatile ( + asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 diff --git a/source/scale_neon.cc b/source/scale_neon.cc index ba25fc6ec..88378c575 100644 --- a/source/scale_neon.cc +++ 
b/source/scale_neon.cc @@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" @@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" @@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop @@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; - asm volatile ( + asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" @@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" @@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" @@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "vld1.8 {q3}, [%3] \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! 
\n" @@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; - asm volatile ( + asm volatile( "vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" @@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { - asm volatile ( + asm volatile( "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" @@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u8 d30, #3 \n" "1: \n" @@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "1: \n" @@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "1: \n" @@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "vmov.u16 d31, #3 \n" "1: \n" @@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" @@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "vmov.u8 d30, #3 \n" "1: \n" @@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" @@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "vmov.u16 d30, #3 \n" "1: \n" @@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" @@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( + asm volatile( "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! \n" // load 16 bytes @@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. 
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB @@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "mov r12, %3, lsl #2 \n" "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" @@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, int src_stepx, uint8_t* dst_argb, int dst_width) { - asm volatile ( + asm volatile( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" "1: \n" @@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb, int dx) { int tmp; const uint8_t* src_tmp = src_argb; - asm volatile ( + asm volatile( "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) @@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV @@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 69c51b1bb..de19989fc 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" @@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -172,18 +172,18 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v29.16b}, [%[kShuf34_0]] \n" - "ld1 {v30.16b}, [%[kShuf34_1]] \n" - "ld1 {v31.16b}, [%[kShuf34_2]] \n" - "1: \n" + "ld1 {v29.16b}, [%[kShuf34_0]] \n" + "ld1 {v30.16b}, [%[kShuf34_1]] \n" + "ld1 {v31.16b}, [%[kShuf34_2]] \n" + "1: \n" "ld1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%[src_ptr]], #64 \n" - "subs %w[width], %w[width], #48 \n" - "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" - "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" - "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" - "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" - "b.gt 1b \n" + "subs %w[width], 
%w[width], #48 \n" + "tbl v0.16b, {v0.16b, v1.16b}, v29.16b \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" + "tbl v1.16b, {v1.16b, v2.16b}, v30.16b \n" + "tbl v2.16b, {v2.16b, v3.16b}, v31.16b \n" + "st1 {v0.16b,v1.16b,v2.16b}, [%[dst_ptr]], #48 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [width] "+r"(dst_width) // %[width] @@ -326,7 +326,7 @@ static const vec16 kMult38_Div664 = { 65536 / 12, 65536 / 12, 65536 / 8, 65536 / 12, 65536 / 12, 65536 / 8, 0, 0}; static const vec16 kMult38_Div996 = {65536 / 18, 65536 / 18, 65536 / 12, 65536 / 18, 65536 / 18, 65536 / 12, - 0, 0}; + 0, 0}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8_t* src_ptr, @@ -335,26 +335,26 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "ld1 {v3.16b}, [%[kShuf38]] \n" - "subs %w[width], %w[width], #12 \n" - "b.eq 2f \n" + "ld1 {v3.16b}, [%[kShuf38]] \n" + "subs %w[width], %w[width], #12 \n" + "b.eq 2f \n" - "1: \n" - "ldp q0, q1, [%[src_ptr]], #32 \n" - "subs %w[width], %w[width], #12 \n" - "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead - "str q2, [%[dst_ptr]] \n" - "add %[dst_ptr], %[dst_ptr], #12 \n" - "b.gt 1b \n" + "1: \n" + "ldp q0, q1, [%[src_ptr]], #32 \n" + "subs %w[width], %w[width], #12 \n" + "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" // prefetch 7 lines ahead + "str q2, [%[dst_ptr]] \n" + "add %[dst_ptr], %[dst_ptr], #12 \n" + "b.gt 1b \n" // Store exactly 12 bytes on the final iteration to avoid writing past // the end of the array. - "2: \n" - "ldp q0, q1, [%[src_ptr]] \n" - "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" - "st1 {v2.8b}, [%[dst_ptr]], #8 \n" - "st1 {v2.s}[2], [%[dst_ptr]] \n" + "2: \n" + "ldp q0, q1, [%[src_ptr]] \n" + "tbl v2.16b, {v0.16b, v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%[dst_ptr]], #8 \n" + "st1 {v2.s}[2], [%[dst_ptr]] \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [width] "+r"(dst_width) // %[width] @@ -378,49 +378,49 @@ void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; asm volatile( - "ld1 {v27.16b}, [%[tblArray1]] \n" - "ld1 {v28.16b}, [%[tblArray2]] \n" - "ld1 {v29.16b}, [%[tblArray3]] \n" - "ld1 {v31.16b}, [%[tblArray4]] \n" - "ld1 {v30.16b}, [%[div996]] \n" + "ld1 {v27.16b}, [%[tblArray1]] \n" + "ld1 {v28.16b}, [%[tblArray2]] \n" + "ld1 {v29.16b}, [%[tblArray3]] \n" + "ld1 {v31.16b}, [%[tblArray4]] \n" + "ld1 {v30.16b}, [%[div996]] \n" - "1: \n" - "ldp q20, q0, [%[src_ptr]], #32 \n" - "ldp q21, q1, [%[src_ptr1]], #32 \n" - "ldp q22, q2, [%[src_ptr2]], #32 \n" + "1: \n" + "ldp q20, q0, [%[src_ptr]], #32 \n" + "ldp q21, q1, [%[src_ptr1]], #32 \n" + "ldp q22, q2, [%[src_ptr2]], #32 \n" - "subs %w[width], %w[width], #12 \n" + "subs %w[width], %w[width], #12 \n" // Add across strided rows first. - "uaddl v23.8h, v20.8b, v21.8b \n" - "uaddl v3.8h, v0.8b, v1.8b \n" - "uaddl2 v24.8h, v20.16b, v21.16b \n" - "uaddl2 v4.8h, v0.16b, v1.16b \n" + "uaddl v23.8h, v20.8b, v21.8b \n" + "uaddl v3.8h, v0.8b, v1.8b \n" + "uaddl2 v24.8h, v20.16b, v21.16b \n" + "uaddl2 v4.8h, v0.16b, v1.16b \n" - "uaddw v23.8h, v23.8h, v22.8b \n" - "uaddw v3.8h, v3.8h, v2.8b \n" - "uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ... 
- "uaddw2 v4.8h, v4.8h, v2.16b \n" + "uaddw v23.8h, v23.8h, v22.8b \n" + "uaddw v3.8h, v3.8h, v2.8b \n" + "uaddw2 v24.8h, v24.8h, v22.16b \n" // abcdefgh ... + "uaddw2 v4.8h, v4.8h, v2.16b \n" // Permute groups of {three,three,two} into separate vectors to sum. - "tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ... - "tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n" - "tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ... - "tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n" - "tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0... - "tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n" + "tbl v20.16b, {v23.16b, v24.16b}, v27.16b \n" // a d g ... + "tbl v0.16b, {v3.16b, v4.16b}, v27.16b \n" + "tbl v21.16b, {v23.16b, v24.16b}, v28.16b \n" // b e h ... + "tbl v1.16b, {v3.16b, v4.16b}, v28.16b \n" + "tbl v22.16b, {v23.16b, v24.16b}, v29.16b \n" // c f 0... + "tbl v2.16b, {v3.16b, v4.16b}, v29.16b \n" - "add v23.8h, v20.8h, v21.8h \n" - "add v3.8h, v0.8h, v1.8h \n" - "add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h - "add v4.8h, v3.8h, v2.8h \n" + "add v23.8h, v20.8h, v21.8h \n" + "add v3.8h, v0.8h, v1.8h \n" + "add v24.8h, v23.8h, v22.8h \n" // a+b+c d+e+f g+h + "add v4.8h, v3.8h, v2.8h \n" - "sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6} - "sqrdmulh v25.8h, v4.8h, v30.8h \n" - "tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow. - "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" - "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" - "b.gt 1b \n" + "sqrdmulh v24.8h, v24.8h, v30.8h \n" // v /= {9,9,6} + "sqrdmulh v25.8h, v4.8h, v30.8h \n" + "tbl v21.16b, {v24.16b, v25.16b}, v31.16b \n" // Narrow. + "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" + "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] @@ -446,41 +446,41 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( - "ld1 {v28.16b}, [%[tblArray1]] \n" - "ld1 {v29.16b}, [%[tblArray2]] \n" - "ld1 {v31.16b}, [%[tblArray3]] \n" - "ld1 {v30.8h}, [%[div664]] \n" + "ld1 {v28.16b}, [%[tblArray1]] \n" + "ld1 {v29.16b}, [%[tblArray2]] \n" + "ld1 {v31.16b}, [%[tblArray3]] \n" + "ld1 {v30.8h}, [%[div664]] \n" - "1: \n" - "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... - "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... - "subs %w[width], %w[width], #12 \n" + "1: \n" + "ldp q20, q0, [%[src_ptr]], #32 \n" // abcdefgh ... + "ldp q21, q1, [%[src_ptr1]], #32 \n" // ijklmnop ... + "subs %w[width], %w[width], #12 \n" // Permute into groups of six values (three pairs) to be summed. - "tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ... - "tbl v2.16b, {v0.16b}, v28.16b \n" - "tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ... - "tbl v3.16b, {v1.16b}, v28.16b \n" - "tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ... - "tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n" + "tbl v22.16b, {v20.16b}, v28.16b \n" // abdegh ... + "tbl v2.16b, {v0.16b}, v28.16b \n" + "tbl v23.16b, {v21.16b}, v28.16b \n" // ijlmop ... + "tbl v3.16b, {v1.16b}, v28.16b \n" + "tbl v24.16b, {v20.16b, v21.16b}, v29.16b \n" // ckfn00 ... + "tbl v4.16b, {v0.16b, v1.16b}, v29.16b \n" - "uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ... - "uaddlp v2.8h, v2.16b \n" - "uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ... - "uaddlp v3.8h, v3.16b \n" - "uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ... - "uaddlp v4.8h, v4.16b \n" - "add v20.8h, v22.8h, v23.8h \n" - "add v0.8h, v2.8h, v3.8h \n" - "add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ... 
- "add v1.8h, v0.8h, v4.8h \n" + "uaddlp v22.8h, v22.16b \n" // a+b d+e g+h ... + "uaddlp v2.8h, v2.16b \n" + "uaddlp v23.8h, v23.16b \n" // i+j l+m o+p ... + "uaddlp v3.8h, v3.16b \n" + "uaddlp v24.8h, v24.16b \n" // c+k f+n 0 ... + "uaddlp v4.8h, v4.16b \n" + "add v20.8h, v22.8h, v23.8h \n" + "add v0.8h, v2.8h, v3.8h \n" + "add v21.8h, v20.8h, v24.8h \n" // a+b+i+j+c+k ... + "add v1.8h, v0.8h, v4.8h \n" - "sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4} - "sqrdmulh v22.8h, v1.8h, v30.8h \n" - "tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow. - "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" - "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" - "b.gt 1b \n" + "sqrdmulh v21.8h, v21.8h, v30.8h \n" // v /= {6,6,4} + "sqrdmulh v22.8h, v1.8h, v30.8h \n" + "tbl v21.16b, {v21.16b, v22.16b}, v31.16b \n" // Narrow. + "st1 {v21.d}[0], [%[dst_ptr]], #8 \n" + "st1 {v21.s}[2], [%[dst_ptr]], #4 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst_ptr), // %[dst_ptr] [src_ptr1] "+r"(src_ptr1), // %[src_ptr1] @@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; - asm volatile ( + asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "1: \n" @@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" @@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; - asm volatile ( + asm volatile( "movi v31.8h, #3 \n" "1: \n" @@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; - asm volatile ( + asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" @@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile ( + asm volatile( "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes @@ -1043,14 +1043,14 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr, "trn1 v21.8h, v2.8h, v0.8h \n" "1: \n" SCALE_FILTER_COLS_STEP_ADDR - "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[1], 
[%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR - "ld1 {v6.h}[7], [%[tmp_ptr]] \n" + "ldr h6, [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[1], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[2], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[3], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[4], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[5], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[6], [%[tmp_ptr]] \n" SCALE_FILTER_COLS_STEP_ADDR + "ld1 {v6.h}[7], [%[tmp_ptr]] \n" "subs %w[width], %w[width], #8 \n" // 8 processed per loop "trn1 v4.16b, v6.16b, v0.16b \n" @@ -1090,14 +1090,14 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" + "1: \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[src]], #64 \n" - "subs %w[width], %w[width], #8 \n" - "prfm pldl1keep, [%[src], 448] \n" - "uzp2 v0.4s, v0.4s, v1.4s \n" - "uzp2 v1.4s, v2.4s, v3.4s \n" - "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" + "prfm pldl1keep, [%[src], 448] \n" + "uzp2 v0.4s, v0.4s, v1.4s \n" + "uzp2 v1.4s, v2.4s, v3.4s \n" + "st1 {v0.4s, v1.4s}, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_ptr), // %[src] [dst] "+r"(dst), // %[dst] [width] "+r"(dst_width) // %[width] @@ -1113,15 +1113,15 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, const uint8_t* src_argb1 = src_argb + 32; asm volatile( "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]] \n" - "add %[src], %[src], #64 \n" - "ld2 {v2.4s, v3.4s}, [%[src1]] \n" - "add %[src1], %[src1], #64 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v1.16b, v2.16b, v3.16b \n" - "subs %w[width], %w[width], #8 \n" - "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" - "b.gt 1b \n" + "ld2 {v0.4s, v1.4s}, [%[src]] \n" + "add %[src], %[src], #64 \n" + "ld2 {v2.4s, v3.4s}, [%[src1]] \n" + "add %[src1], %[src1], #64 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v1.16b, v2.16b, v3.16b \n" + "subs %w[width], %w[width], #8 \n" + "st1 {v0.16b, v1.16b}, [%[dst]], #32 \n" + "b.gt 1b \n" : [src] "+r"(src_argb), // %[src] [src1] "+r"(src_argb1), // %[src1] [dst] "+r"(dst_argb), // %[dst] @@ -1135,21 +1135,21 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "1: \n" - "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" - "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" - "uaddl v2.8h, v0.8b, v1.8b \n" - "uaddl2 v3.8h, v0.16b, v1.16b \n" - "uaddl v22.8h, v20.8b, v21.8b \n" - "uaddl2 v23.8h, v20.16b, v21.16b \n" - "add v0.8h, v2.8h, v22.8h \n" - "add v1.8h, v3.8h, v23.8h \n" - "rshrn v0.8b, v0.8h, #2 \n" - "rshrn v1.8b, v1.8h, #2 \n" - "subs %w[width], %w[width], #4 \n" - "stp d0, d1, [%[dst]], #16 \n" - "b.gt 1b \n" + asm volatile( + "1: \n" + "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" + "ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n" + "uaddl v2.8h, v0.8b, v1.8b \n" + "uaddl2 v3.8h, v0.16b, v1.16b \n" + "uaddl v22.8h, v20.8b, v21.8b \n" + "uaddl2 v23.8h, v20.16b, v21.16b \n" + "add v0.8h, v2.8h, v22.8h \n" + "add v1.8h, v3.8h, v23.8h \n" + "rshrn v0.8b, v0.8h, #2 \n" + "rshrn v1.8b, v1.8h, #2 \n" + "subs %w[width], %w[width], #4 \n" + "stp d0, d1, 
[%[dst]], #16 \n" + "b.gt 1b \n" : [src] "+r"(src_ptr), [src1] "+r"(src_ptr1), [dst] "+r"(dst), [width] "+r"(dst_width) : @@ -1166,26 +1166,22 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, const uint8_t* src_argb3 = src_argb + src_stepx * 12; int64_t i = 0; (void)src_stride; - asm volatile ( - "1: \n" - "ldr w10, [%[src], %[i]] \n" - "ldr w11, [%[src1], %[i]] \n" - "ldr w12, [%[src2], %[i]] \n" - "ldr w13, [%[src3], %[i]] \n" - "add %[i], %[i], %[step] \n" - "subs %w[width], %w[width], #4 \n" - "prfm pldl1keep, [%[src], 448] \n" - "stp w10, w11, [%[dst]], #8 \n" - "stp w12, w13, [%[dst]], #8 \n" - "b.gt 1b \n" - : [src]"+r"(src_argb), - [src1]"+r"(src_argb1), - [src2]"+r"(src_argb2), - [src3]"+r"(src_argb3), - [dst]"+r"(dst_argb), - [width]"+r"(dst_width), - [i]"+r"(i) - : [step]"r"((int64_t)(src_stepx * 16)) + asm volatile( + "1: \n" + "ldr w10, [%[src], %[i]] \n" + "ldr w11, [%[src1], %[i]] \n" + "ldr w12, [%[src2], %[i]] \n" + "ldr w13, [%[src3], %[i]] \n" + "add %[i], %[i], %[step] \n" + "subs %w[width], %w[width], #4 \n" + "prfm pldl1keep, [%[src], 448] \n" + "stp w10, w11, [%[dst]], #8 \n" + "stp w12, w13, [%[dst]], #8 \n" + "b.gt 1b \n" + : [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2), + [src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width), + [i] "+r"(i) + : [step] "r"((int64_t)(src_stepx * 16)) : "memory", "cc", "w10", "w11", "w12", "w13"); } @@ -1312,33 +1308,33 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, "1: \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d1, [%6] \n" // + "ldr d1, [%6] \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ldr d2, [%6] \n" - "shrn v4.4h, v5.4s, #9 \n" // + "ldr d2, [%6] \n" + "shrn v4.4h, v5.4s, #9 \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v1.d}[1], [%6] \n" // + "ld1 {v1.d}[1], [%6] \n" // SCALE_ARGB_FILTER_COLS_STEP_ADDR - "ld1 {v2.d}[1], [%6] \n" + "ld1 {v2.d}[1], [%6] \n" - "subs %w2, %w2, #4 \n" // 4 processed per loop - "and v4.8b, v4.8b, v3.8b \n" - "trn1 v0.4s, v1.4s, v2.4s \n" - "tbl v4.16b, {v4.16b}, v18.16b \n" // f - "trn2 v1.4s, v1.4s, v2.4s \n" - "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f + "subs %w2, %w2, #4 \n" // 4 processed per loop + "and v4.8b, v4.8b, v3.8b \n" + "trn1 v0.4s, v1.4s, v2.4s \n" + "tbl v4.16b, {v4.16b}, v18.16b \n" // f + "trn2 v1.4s, v1.4s, v2.4s \n" + "eor v7.16b, v4.16b, v3.16b \n" // 0x7f ^ f - "umull v16.8h, v1.8b, v4.8b \n" - "umull2 v17.8h, v1.16b, v4.16b \n" - "umlal v16.8h, v0.8b, v7.8b \n" - "umlal2 v17.8h, v0.16b, v7.16b \n" + "umull v16.8h, v1.8b, v4.8b \n" + "umull2 v17.8h, v1.16b, v4.16b \n" + "umlal v16.8h, v0.8b, v7.8b \n" + "umlal2 v17.8h, v0.16b, v7.16b \n" - "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead - "shrn v0.8b, v16.8h, #7 \n" - "shrn v1.8b, v17.8h, #7 \n" - "add v5.4s, v5.4s, v6.4s \n" - "stp d0, d1, [%0], #16 \n" // store pixels - "b.gt 1b \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + "shrn v0.8b, v16.8h, #7 \n" + "shrn v1.8b, v17.8h, #7 \n" + "add v5.4s, v5.4s, v6.4s \n" + "stp d0, d1, [%0], #16 \n" // store pixels + "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 @@ -1360,34 +1356,34 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "subs %w[dst_width], %w[dst_width], #32 \n" - "b.lt 2f \n" + "subs %w[dst_width], %w[dst_width], #32 \n" + "b.lt 2f \n" "1: \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "ldp q4, q5, [%[src_ptr], #64] \n" - "ldp q6, q7, [%[src_ptr], #96] \n" - "add 
%[src_ptr], %[src_ptr], #128 \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "uzp2 v2.8h, v4.8h, v5.8h \n" - "uzp2 v3.8h, v6.8h, v7.8h \n" - "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. - "stp q0, q1, [%[dst_ptr]] \n" - "stp q2, q3, [%[dst_ptr], #32] \n" - "add %[dst_ptr], %[dst_ptr], #64 \n" - "b.ge 1b \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "ldp q4, q5, [%[src_ptr], #64] \n" + "ldp q6, q7, [%[src_ptr], #96] \n" + "add %[src_ptr], %[src_ptr], #128 \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "uzp2 v2.8h, v4.8h, v5.8h \n" + "uzp2 v3.8h, v6.8h, v7.8h \n" + "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. + "stp q0, q1, [%[dst_ptr]] \n" + "stp q2, q3, [%[dst_ptr], #32] \n" + "add %[dst_ptr], %[dst_ptr], #64 \n" + "b.ge 1b \n" "2: \n" - "adds %w[dst_width], %w[dst_width], #32 \n" - "b.eq 99f \n" + "adds %w[dst_width], %w[dst_width], #32 \n" + "b.eq 99f \n" - "ldp q0, q1, [%[src_ptr]] \n" - "ldp q2, q3, [%[src_ptr], #32] \n" - "uzp2 v0.8h, v0.8h, v1.8h \n" - "uzp2 v1.8h, v2.8h, v3.8h \n" - "stp q0, q1, [%[dst_ptr]] \n" + "ldp q0, q1, [%[src_ptr]] \n" + "ldp q2, q3, [%[src_ptr], #32] \n" + "uzp2 v0.8h, v0.8h, v1.8h \n" + "uzp2 v1.8h, v2.8h, v3.8h \n" + "stp q0, q1, [%[dst_ptr]] \n" "99: \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] @@ -1403,15 +1399,15 @@ void ScaleRowDown2Linear_16_NEON(const uint16_t* src_ptr, int dst_width) { (void)src_stride; asm volatile( - "1: \n" - "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" - "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" - "subs %w[dst_width], %w[dst_width], #16 \n" - "urhadd v0.8h, v0.8h, v1.8h \n" - "urhadd v1.8h, v2.8h, v3.8h \n" - "prfm pldl1keep, [%[src_ptr], 448] \n" - "stp q0, q1, [%[dst_ptr]], #32 \n" - "b.gt 1b \n" + "1: \n" + "ld2 {v0.8h, v1.8h}, [%[src_ptr]], #32 \n" + "ld2 {v2.8h, v3.8h}, [%[src_ptr]], #32 \n" + "subs %w[dst_width], %w[dst_width], #16 \n" + "urhadd v0.8h, v0.8h, v1.8h \n" + "urhadd v1.8h, v2.8h, v3.8h \n" + "prfm pldl1keep, [%[src_ptr], 448] \n" + "stp q0, q1, [%[dst_ptr]], #32 \n" + "b.gt 1b \n" : [src_ptr] "+r"(src_ptr), // %[src_ptr] [dst_ptr] "+r"(dst), // %[dst_ptr] [dst_width] "+r"(dst_width) // %[dst_width] @@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" @@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. @@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, uint8_t* dst, int dst_width) { (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV "subs %w2, %w2, #8 \n" // 8 processed per loop. 
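As a plain-C reference (not from the patch), the 2x down-scalers in this file compute the following per output sample: the Linear variants average two horizontal neighbours with rounding (the urhadd above), and the Box variants average the full 2x2 block with rounding (widening adds followed by rshrn #2). Shown here for 8-bit data; the _16 variants do the same arithmetic in 16 bits, and the UV/ARGB variants apply it per channel.

static inline uint8_t RoundAvg2(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // matches urhadd
}

static inline uint8_t RoundAvg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);  // matches uaddl/add + rshrn #2
}

static void ScaleRowDown2Box_Ref(const uint8_t* row0, const uint8_t* row1,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = RoundAvg4(row0[2 * x], row0[2 * x + 1],  // 2x2 box average
                       row1[2 * x], row1[2 * x + 1]);
  }
}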
@@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { - asm volatile ( + asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" @@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; - asm volatile ( + asm volatile( "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" diff --git a/source/scale_rgb.cc b/source/scale_rgb.cc index 225fd21ec..5e69fe379 100644 --- a/source/scale_rgb.cc +++ b/source/scale_rgb.cc @@ -10,8 +10,8 @@ #include "libyuv/scale.h" /* For FilterMode */ -#include #include +#include #include #include #include @@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb, int dst_height, enum FilterMode filtering) { int r; - if (!src_rgb || !dst_rgb || - src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 || - dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) { + if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 || + src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 || + dst_height <= 0) { return -1; } const int abs_src_height = (src_height < 0) ? -src_height : src_height; diff --git a/source/scale_rvv.cc b/source/scale_rvv.cc index 9fe2b2773..4617e1a96 100644 --- a/source/scale_rvv.cc +++ b/source/scale_rvv.cc @@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb, const uint32_t* src = (const uint32_t*)(src_argb); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_odd, v_even, v_dst; vuint32m4_t v_odd_32, v_even_32; @@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb, const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride); // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16; @@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb, const int stride_byte = src_stepx * 4; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst; vuint16m8_t v_row0_sum, v_row1_sum, v_sum; @@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_s0, v_s1, v_dst; size_t vl = __riscv_vsetvl_e8m4(w); @@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
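A short aside on the vxrm writes that recur throughout scale_rvv.cc (commentary, not patch content): RVV fixed-point instructions, including the averaging adds these kernels rely on, take their rounding behaviour from the vxrm CSR, and mode 0 (round-to-nearest-up) reproduces the rounded averages of the NEON and x86 paths. In scalar terms:

// Round-to-nearest-up halving add, the behaviour selected by
//   asm volatile("csrwi vxrm, 0");
// before the vector loops in this file.
static inline uint8_t HalvingAddRnu(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);  // add the discarded bit back in
}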
- asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { size_t vl = __riscv_vsetvl_e8m4(w); vuint8m4_t v_s0, v_s1, v_t0, v_t1; @@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_t0, v_t1, v_t2, v_t3; @@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16; @@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr, const uint8_t* t = src_ptr + src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_s0, v_s1, v_s2, v_s3; vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3; @@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv, (void)src_stride; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). - asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m4_t v_u0v0, v_u1v1, v_avg; vuint16m4_t v_u0v0_16, v_u1v1_16; @@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv, size_t w = (size_t)dst_width; // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode // register) is set to round-to-nearest-up mode(0). 
- asm volatile ("csrwi vxrm, 0"); + asm volatile("csrwi vxrm, 0"); do { vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0; vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1; diff --git a/source/scale_sme.cc b/source/scale_sme.cc index 6b22f24d0..fa74569d1 100644 --- a/source/scale_sme.cc +++ b/source/scale_sme.cc @@ -15,7 +15,6 @@ namespace libyuv { extern "C" { #endif - #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \ defined(__aarch64__) diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 7b318cf72..700d1b2b6 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width, #endif #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV) if (TestCpuFlag(kCpuHasRVV) && !filtering) { - #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) - ScaleUVRowDownEven = ScaleUVRowDownEven_RVV; - #endif - #if defined(HAS_SCALEUVROWDOWN4_RVV) - if (col_step == 4) { - ScaleUVRowDownEven = ScaleUVRowDown4_RVV; - } - #endif +#if defined(HAS_SCALEUVROWDOWNEVEN_RVV) + ScaleUVRowDownEven = ScaleUVRowDownEven_RVV; +#endif +#if defined(HAS_SCALEUVROWDOWN4_RVV) + if (col_step == 4) { + ScaleUVRowDownEven = ScaleUVRowDown4_RVV; + } +#endif } #endif diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index 8eadba39a..718afec36 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -12,6 +12,7 @@ #include #include +#include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/convert.h" @@ -19,7 +20,6 @@ #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" -#include "../unit_test/unit_test.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/video_common.h" diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 427614420..cb9bf1c40 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) { #endif #ifdef __linux__ -static void KernelVersion(int *version) { +static void KernelVersion(int* version) { struct utsname buffer; int i = 0; version[0] = version[1] = 0; if (uname(&buffer) == 0) { - char *v = buffer.release; + char* v = buffer.release; for (i = 0; *v && i < 2; ++v) { if (isdigit(*v)) { - version[i++] = (int) strtol(v, &v, 10); + version[i++] = (int)strtol(v, &v, 10); } } } @@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { // Read and print the RVV vector length. 
if (has_rvv) { - register uint32_t vlenb __asm__ ("t0"); - __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb)); + register uint32_t vlenb __asm__("t0"); + __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb)); printf("RVV vector length: %d bytes\n", vlenb); } } @@ -161,7 +161,7 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { #if defined(__loongarch__) int has_loongarch = TestCpuFlag(kCpuHasLOONGARCH); if (has_loongarch) { - int has_lsx = TestCpuFlag(kCpuHasLSX); + int has_lsx = TestCpuFlag(kCpuHasLSX); int has_lasx = TestCpuFlag(kCpuHasLASX); printf("Has LOONGARCH 0x%x\n", has_loongarch); printf("Has LSX 0x%x\n", has_lsx); @@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { } #endif // defined(__loongarch__) -#if defined(__i386__) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_X64) +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(_M_X64) int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); @@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) { printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8); printf("Has AMXINT8 0x%x\n", has_amxint8); } -#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) +#endif // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || + // defined(_M_X64) } TEST_F(LibYUVBaseTest, TestCompilerMacros) { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 30d660e4b..ca3cbe769 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width, // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); for (int i = 0; i < benchmark_iterations; i++) { - CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert); + CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, + benchmark_width, benchmark_height * invert); } // Enable optimizations. 
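Stepping back for a moment (commentary, not patch content): TestCopyPlane and the other tests in this file all follow the same verification pattern, running each API once with SIMD masked off and once with it enabled, then comparing the outputs. A standalone sketch, assuming the usual MaskCpuFlags conventions (1 disables all optimizations, -1 re-detects and enables everything):

#include <string.h>

#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"

// Returns true when the optimized CopyPlane path matches the C path exactly.
static bool CopyPlaneMatchesCPath(const uint8_t* src, uint8_t* dst_c,
                                  uint8_t* dst_opt, int width, int height) {
  libyuv::MaskCpuFlags(1);   // force the portable C implementation
  libyuv::CopyPlane(src, width, dst_c, width, width, height);
  libyuv::MaskCpuFlags(-1);  // re-enable SIMD kernels
  libyuv::CopyPlane(src, width, dst_opt, width, width, height);
  return memcmp(dst_c, dst_opt, (size_t)width * height) == 0;
}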
MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; i++) { - CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert); + CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, + benchmark_width, benchmark_height * invert); } int max_diff = 0; for (int i = 0; i < y_plane_size; ++i) { - int abs_diff = abs(static_cast(dst_c[i]) - static_cast(dst_opt[i])); + int abs_diff = + abs(static_cast(dst_c[i]) - static_cast(dst_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } @@ -1596,29 +1599,29 @@ static int TestCopyPlane(int benchmark_width, TEST_F(LibYUVPlanarTest, CopyPlane_Any) { int max_diff = TestCopyPlane(benchmark_width_ + 1, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, +1, 0); + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Unaligned) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Invert) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 0); } TEST_F(LibYUVPlanarTest, CopyPlane_Opt) { int max_diff = TestCopyPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 0); } @@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width, // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); for (j = 0; j < benchmark_iterations; j++) { - HalfFloatPlane(reinterpret_cast(orig_y + off), benchmark_width * 2, - reinterpret_cast(dst_c), benchmark_width * 2, - scale, benchmark_width, benchmark_height * invert); + HalfFloatPlane(reinterpret_cast(orig_y + off), + benchmark_width * 2, reinterpret_cast(dst_c), + benchmark_width * 2, scale, benchmark_width, + benchmark_height * invert); } // Enable optimizations. 
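For context on what TestHalfFloatPlane exercises (commentary, not patch content): HalfFloatPlane scales each 16-bit sample and stores it as an IEEE binary16 bit pattern. One common way to express the per-row conversion is the 2^-112 exponent re-bias shown below; this is only a sketch, it truncates rather than rounds the mantissa, and it ignores the binary16 denormal corner that the flush-denormal tests further down are about.

#include <stdint.h>
#include <string.h>

static inline uint16_t FloatToHalfBits(float f) {
  uint32_t u;
  f *= 1.9259299444e-34f;  // 2^-112 re-biases the float exponent for binary16
  memcpy(&u, &f, sizeof(u));
  return (uint16_t)(u >> 13);  // exponent + top 10 mantissa bits, truncated
}

static void HalfFloatRowRef(const uint16_t* src, uint16_t* dst, float scale,
                            int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = FloatToHalfBits((float)src[i] * scale);  // src is non-negative
  }
}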
MaskCpuFlags(benchmark_cpu_info); for (j = 0; j < benchmark_iterations; j++) { - HalfFloatPlane(reinterpret_cast(orig_y + off), benchmark_width * 2, - reinterpret_cast(dst_opt), benchmark_width * 2, - scale, benchmark_width, benchmark_height * invert); + HalfFloatPlane(reinterpret_cast(orig_y + off), + benchmark_width * 2, reinterpret_cast(dst_opt), + benchmark_width * 2, scale, benchmark_width, + benchmark_height * invert); } int max_diff = 0; @@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) { } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); EXPECT_EQ(0, diff); } @@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) { } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) { - int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, 
TestHalfFloatPlane_16bit_denormal) { - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); EXPECT_EQ(0, diff); } #if defined(__arm__) static void EnableFlushDenormalToZero(void) { uint32_t cw; - asm volatile ( - "vmrs %0, fpscr \n" - "orr %0, %0, #0x1000000 \n" - "vmsr fpscr, %0 \n" - : "=r"(cw) - ::"memory", "cc"); // Clobber List + asm volatile( + "vmrs %0, fpscr \n" + "orr %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw)::"memory", "cc"); // Clobber List } static void DisableFlushDenormalToZero(void) { uint32_t cw; - asm volatile ( - "vmrs %0, fpscr \n" - "bic %0, %0, #0x1000000 \n" - "vmsr fpscr, %0 \n" - : "=r"(cw) - ::"memory", "cc"); // Clobber List + asm volatile( + "vmrs %0, fpscr \n" + "bic %0, %0, #0x1000000 \n" + "vmsr fpscr, %0 \n" + : "=r"(cw)::"memory", "cc"); // Clobber List } // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes @@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) { TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) { // 32 bit arm rounding on denormal case is off by 1 compared to C. EnableFlushDenormalToZero(); - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0); DisableFlushDenormalToZero(); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) { EnableFlushDenormalToZero(); - int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_, - benchmark_iterations_, disable_cpu_flags_, - benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); + int diff = TestHalfFloatPlane( + benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0); DisableFlushDenormalToZero(); EXPECT_EQ(0, diff); } @@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { tmp_pixels_c_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, - benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, - benchmark_width_ * 3, benchmark_width_, benchmark_height_); + benchmark_width_, tmp_pixels_c_b, benchmark_width_, + dst_pixels_c, benchmark_width_ * 3, benchmark_width_, + benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r, @@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { tmp_pixels_c_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g, - benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c, - benchmark_width_ * 3, benchmark_width_, benchmark_height_); + benchmark_width_, tmp_pixels_c_b, benchmark_width_, + dst_pixels_c, benchmark_width_ * 3, benchmark_width_, + benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { @@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { for (int i = 0; i < benchmark_iterations_; ++i) { 
MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, - benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0, - dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, + benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, + 0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } @@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { for (int i = 0; i < benchmark_iterations_; ++i) { SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, benchmark_width_, - tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_, - benchmark_height_); + tmp_pixels_opt_b, benchmark_width_, NULL, 0, + benchmark_width_, benchmark_height_); } MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g, diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 8959addde..66fd4cf31 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -320,16 +320,16 @@ TEST_FACTOR(3, 1, 3) #ifndef DISABLE_SLOW_TESTS // Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(, name, width, height, None, 0) \ - TEST_SCALETO1(, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ TEST_SCALETO1(, name, width, height, Bilinear, 3) \ TEST_SCALETO1(, name, width, height, Box, 3) #else #if defined(ENABLE_FULL_TESTS) -#define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ - TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Box, 3) #else diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 088cd29a9..fd8fff802 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -1058,7 +1058,7 @@ TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 1280, 720) TEST_SCALETO(Scale, 1920, 1080) TEST_SCALETO(Scale, 1080, 1920) // for rotated phones -#endif // DISABLE_SLOW_TESTS +#endif // DISABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO
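One closing note on the scale test macros reformatted above (commentary, not patch content): TEST_SCALETO and TEST_SCALETO1 are test generators, producing one gtest case per target size and filter mode, with the trailing argument roughly acting as the allowed pixel difference (0 for None, 3 for the filtering modes). Illustratively, with ScaleToSpecifiedSize standing in for the real helper:

#define TEST_SCALETO1_SKETCH(DISABLED_, name, width, height, filter, max_diff) \
  TEST_F(LibYUVScaleTest,                                                      \
         DISABLED_##name##To##width##x##height##_##filter) {                   \
    int diff = ScaleToSpecifiedSize(benchmark_width_, benchmark_height_,       \
                                    width, height, kFilter##filter,            \
                                    benchmark_iterations_, disable_cpu_flags_, \
                                    benchmark_cpu_info_);                      \
    EXPECT_LE(diff, max_diff);                                                 \
  }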