diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 2c01f5c1e..5dfa71edc 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -29,7 +29,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { asm volatile( "vmov.u16 q4, #0 \n" // accumulator - "1: \n" + "1: \n" "vld1.8 {q0, q1}, [%0]! \n" "vld1.8 {q2, q3}, [%1]! \n" "veor.32 q0, q0, q2 \n" @@ -60,7 +60,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "vmov.u8 q9, #0 \n" "vmov.u8 q11, #0 \n" - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q1}, [%1]! \n" "subs %2, %2, #16 \n" diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index bedb8d1b0..ddf98fa68 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -27,7 +27,7 @@ uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) { asm volatile( "movi v4.8h, #0 \n" - "1: \n" + "1: \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n" "ld1 {v2.16b, v3.16b}, [%1], #32 \n" "eor v0.16b, v0.16b, v2.16b \n" @@ -55,7 +55,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "eor v17.16b, v17.16b, v17.16b \n" "eor v19.16b, v19.16b, v19.16b \n" - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" "ld1 {v1.16b}, [%1], #16 \n" "subs %w2, %w2, #16 \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index fea3d0689..1af828622 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -115,7 +115,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" - "1: \n" READYUV444 YUVTORGB + "1: \n" READYUV444 YUVTORGB "subs %4, %4, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -141,7 +141,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" @@ -167,7 +167,7 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %5, %5, #8 \n" "vld1.8 {d23}, [%3]! \n" "vst4.8 {d20, d21, d22, d23}, [%4]! \n" @@ -194,7 +194,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" "vmov.u8 d19, #255 \n" // d19 modified by // YUVTORGB @@ -221,7 +221,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" @@ -253,7 +253,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" ARGBTORGB565 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "bgt 1b \n" @@ -287,7 +287,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 "vst1.8 {q0}, [%3]! \n" // store 8 pixels @@ -325,7 +325,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, YUVTORGB_SETUP "vmov.u8 d4, #0x0f \n" // bits to clear with // vbic. - "1: \n" READYUV422 YUVTORGB + "1: \n" READYUV422 YUVTORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels @@ -348,7 +348,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" - "1: \n" READYUV400 YUVTORGB + "1: \n" READYUV400 YUVTORGB "subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" @@ -366,7 +366,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile( "vmov.u8 d23, #255 \n" - "1: \n" + "1: \n" "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" @@ -385,23 +385,22 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV12 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } void NV21ToARGBRow_NEON(const uint8* src_y, @@ -409,23 +408,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READNV21 YUVTORGB - "subs %3, %3, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } void NV12ToRGB565Row_NEON(const uint8* src_y, @@ -435,7 +433,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB + "1: \n" READNV12 YUVTORGB "subs %3, %3, #8 \n" ARGBTORGB565 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" @@ -455,44 +453,42 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READYUY2 YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" READUYVY YUVTORGB - "subs %2, %2, #8 \n" - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", - "q12", "q13", "q14", "q15"); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. @@ -501,7 +497,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop "vst1.8 {q0}, [%1]! \n" // store U @@ -522,7 +518,7 @@ void MergeUVRow_NEON(const uint8* src_u, uint8* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop @@ -590,7 +586,7 @@ void MergeRGBRow_NEON(const uint8* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 @@ -607,7 +603,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile( "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" + "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop "vst1.8 {q0}, [%0]! 
\n" // store "bgt 1b \n" @@ -621,7 +617,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile( "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" + "1: \n" "subs %1, %1, #4 \n" // 4 pixels per loop "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" @@ -638,7 +634,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2 \n" "sub %0, #16 \n" - "1: \n" + "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. "vrev64.8 q0, q0 \n" @@ -662,7 +658,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, "add %0, %0, %3, lsl #1 \n" "sub %0, #16 \n" - "1: \n" + "1: \n" "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" @@ -684,7 +680,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "add %0, %0, %2, lsl #2 \n" "sub %0, #16 \n" - "1: \n" + "1: \n" "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" @@ -701,7 +697,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. @@ -717,7 +713,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -733,7 +729,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -763,7 +759,7 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -809,7 +805,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -838,7 +834,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, int width) { asm volatile( "vmov.u8 d3, #255 \n" // Alpha - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -854,7 +850,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vst3.8 {d1, d2, d3}, [%1]! 
\n" // store 8 pixels of @@ -870,7 +866,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B @@ -886,7 +882,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. @@ -901,7 +897,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. @@ -919,7 +915,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.8 {d1}, [%1]! \n" // store 8 U. @@ -939,7 +935,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. "vst1.8 {d0}, [%1]! \n" // store 8 U. @@ -961,7 +957,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int width) { asm volatile( "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. @@ -988,7 +984,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int width) { asm volatile( "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. @@ -1015,7 +1011,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, int width) { asm volatile( "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels @@ -1036,7 +1032,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, uint8* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d3}, [%2]! \n" // load 8 Vs @@ -1058,7 +1054,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, uint8* dst_uyvy, int width) { asm volatile( - "1: \n" + "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d2}, [%2]! \n" // load 8 Vs @@ -1076,7 +1072,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 @@ -1095,13 +1091,14 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, int width) { asm volatile( "vdup.32 d2, %2 \n" // dither4 - "1: \n" + "1: \n" "vld4.8 {d20, d21, d22, d23}, [%1]! 
\n" // load 8 pixels of ARGB. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d20, d20, d2 \n" "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" ARGBTORGB565 - "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. "bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 @@ -1114,12 +1111,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels - // ARGB1555. + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 @@ -1134,12 +1130,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, asm volatile( "vmov.u8 d4, #0x0f \n" // bits to clear with // vbic. - "1: \n" + "1: \n" "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 - "vst1.8 {q0}, [%1]! \n" // store 8 pixels - // ARGB4444. + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 @@ -1154,7 +1149,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1173,7 +1168,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels "subs %2, %2, #16 \n" // 16 processed per loop @@ -1192,7 +1187,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1221,7 +1216,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1249,24 +1244,20 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, "q15"); } +// clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB \ - ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG \ - ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR \ - ", q12 \n" /* R */ \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR \ - ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG \ - ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB \ - ", q13 \n" /* B */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. void ARGBToUVRow_NEON(const uint8* src_argb, @@ -1282,7 +1273,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1328,7 +1319,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1373,7 +1364,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. @@ -1418,7 +1409,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. @@ -1463,7 +1454,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. @@ -1508,7 +1499,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 
@@ -1553,7 +1544,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. @@ -1600,7 +1591,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. @@ -1666,7 +1657,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. @@ -1732,7 +1723,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. @@ -1789,7 +1780,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -1813,7 +1804,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -1837,7 +1828,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -1861,7 +1852,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // R @@ -1884,7 +1875,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q8, d0, d4 \n" // R @@ -1907,7 +1898,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B @@ -1930,7 +1921,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -1953,7 +1944,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -1988,7 +1979,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "rsb %4, #256 \n" "vdup.8 d4, %4 \n" // General purpose row blend. - "1: \n" + "1: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" @@ -2003,7 +1994,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" @@ -2013,13 +2004,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 @@ -2038,7 +2029,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. - "8: \n" + "8: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -2056,12 +2047,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" - "89: \n" + "89: \n" "adds %3, #8-1 \n" "blt 99f \n" // Blend 1 pixels. - "1: \n" + "1: \n" "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. @@ -2093,7 +2084,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile( // Attenuate 8 pixels. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q10, d0, d3 \n" // b * a @@ -2125,7 +2116,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -2166,7 +2157,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 
255) @@ -2198,7 +2189,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -2231,7 +2222,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -2267,7 +2258,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - "1: \n" + "1: \n" "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit @@ -2323,10 +2314,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB - // pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B "vmull.u8 q1, d2, d3 \n" // multiply G @@ -2338,7 +2328,6 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2354,16 +2343,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - // pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2379,16 +2366,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, int width) { asm volatile( // 8 pixel loop. - "1: \n" + "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB - // pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -2409,7 +2394,7 @@ void SobelRow_NEON(const uint8* src_sobelx, asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -2433,7 +2418,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, int width) { asm volatile( // 16 pixel loop. - "1: \n" + "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. @@ -2460,7 +2445,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. - "1: \n" + "1: \n" "vld1.8 {d2}, [%0]! 
\n" // load 8 sobelx. "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. @@ -2485,7 +2470,7 @@ void SobelXRow_NEON(const uint8* src_y0, uint8* dst_sobelx, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" @@ -2523,7 +2508,7 @@ void SobelYRow_NEON(const uint8* src_y0, uint8* dst_sobely, int width) { asm volatile( - "1: \n" + "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" @@ -2555,7 +2540,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile( "vdup.32 q0, %3 \n" - "1: \n" + "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u16 q2, d2 \n" // 8 int's @@ -2580,7 +2565,7 @@ void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile( "vdup.32 q0, %3 \n" - "1: \n" + "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u16 q2, d2 \n" // 8 int's diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 0cc762983..de17f8b73 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -273,7 +273,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB( + "1: \n" READYUV422 YUVTORGB( v22, v21, v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels @@ -310,7 +310,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, asm volatile( YUVTORGB_SETUP "movi v23.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB( + "1: \n" READYUV422 YUVTORGB( v22, v21, v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels @@ -395,7 +395,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile( "movi v23.8b, #255 \n" - "1: \n" + "1: \n" "ld1 {v20.8b}, [%0], #8 \n" "orr v21.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n" @@ -470,7 +470,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, int width) { asm volatile( YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB( + "1: \n" READNV12 YUVTORGB( v22, v21, v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels @@ -544,7 +544,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 processed per loop "st1 {v0.16b}, [%1], #16 \n" // store U @@ -565,7 +565,7 @@ void MergeUVRow_NEON(const uint8* src_u, uint8* dst_uv, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %w3, %w3, #16 \n" // 16 processed per loop @@ -631,7 +631,7 @@ void MergeRGBRow_NEON(const uint8* src_r, // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. 
void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 "subs %w2, %w2, #32 \n" // 32 processed per loop "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 @@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile( "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" + "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -661,7 +661,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile( "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" + "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -676,7 +676,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { // Start at end of source row. "add %0, %0, %w2, sxtw \n" "sub %0, %0, #16 \n" - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop. "rev64 v0.16b, v0.16b \n" @@ -698,7 +698,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, // Start at end of source row. "add %0, %0, %w3, sxtw #1 \n" "sub %0, %0, #16 \n" - "1: \n" + "1: \n" "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 "subs %w3, %w3, #8 \n" // 8 pixels per loop. "rev64 v0.8b, v0.8b \n" @@ -719,7 +719,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { // Start at end of source row. "add %0, %0, %w2, sxtw #2 \n" "sub %0, %0, #16 \n" - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 "subs %w2, %w2, #4 \n" // 4 pixels per loop. "rev64 v0.4s, v0.4s \n" @@ -736,11 +736,10 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { asm volatile( "movi v4.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. "subs %w2, %w2, #8 \n" // 8 processed per loop. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB - // pixels "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 @@ -753,7 +752,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { asm volatile( "movi v5.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g @@ -770,7 +769,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g @@ -801,12 +800,11 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { asm volatile( "movi v3.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 @@ -858,7 +856,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, int width) { asm volatile( "movi v3.8b, #255 \n" // Alpha - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -889,7 +887,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -906,9 +904,8 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24. @@ -923,7 +920,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v4.8b, v2.8b, v2.8b \n" // mov g @@ -940,7 +937,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. @@ -955,7 +952,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. @@ -973,9 +970,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - // pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. @@ -994,9 +990,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_v, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - // pixels "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. @@ -1017,7 +1012,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int width) { const uint8* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row @@ -1044,7 +1039,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int width) { const uint8* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row @@ -1071,7 +1066,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, int width) { asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. "subs %w2, %w2, #4 \n" // 4 processed per loop "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels @@ -1091,7 +1086,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, uint8* dst_yuy2, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "orr v2.8b, v1.8b, v1.8b \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us @@ -1114,7 +1109,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, uint8* dst_uyvy, int width) { asm volatile( - "1: \n" + "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "orr v3.8b, v2.8b, v2.8b \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us @@ -1133,7 +1128,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTORGB565 @@ -1152,7 +1147,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, int width) { asm volatile( "dup v1.4s, %w2 \n" // dither4 - "1: \n" + "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v20.8b, v20.8b, v1.8b \n" @@ -1171,7 +1166,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 @@ -1191,7 +1186,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, asm volatile( "movi v4.16b, #0x0f \n" // bits to clear with // vbic. - "1: \n" + "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 @@ -1211,9 +1206,8 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1231,7 +1225,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { asm volatile( - "1: \n" + "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 // pixels "subs %w2, %w2, #16 \n" // 16 processed per loop @@ -1250,9 +1244,8 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient "movi v6.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -1280,7 +1273,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, "movi v27.8b, #18 \n" // VB -0.1406 coefficient "movi v28.8b, #94 \n" // VG -0.7344 coefficient "movi v29.16b,#0x80 \n" // 128.5 - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB // pixels. 
"subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -1318,23 +1311,19 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// clang-format off #define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB \ - ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR \ - ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG \ - ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG \ - ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR \ - ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB \ - ",v23.8h \n" /* B */ \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. @@ -1626,9 +1615,8 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in - // 16-bit) - "1: \n" + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. @@ -1693,7 +1681,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. @@ -1758,7 +1746,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( RGBTOUV_SETUP_REG - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. @@ -1822,7 +1810,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { "movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -1847,7 +1835,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -1871,7 +1859,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { "movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
ARGB4444TOARGB @@ -1895,7 +1883,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R @@ -1918,7 +1906,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // R @@ -1941,7 +1929,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B @@ -1964,7 +1952,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B @@ -1987,7 +1975,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { "movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" + "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B @@ -2022,7 +2010,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" // General purpose row blend. - "1: \n" + "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" @@ -2037,7 +2025,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b 99f \n" // Blend 50 / 50. - "50: \n" + "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" @@ -2047,13 +2035,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b 99f \n" // Blend 100 / 0 - Copy row unchanged. - "100: \n" + "100: \n" "ld1 {v0.16b}, [%1], #16 \n" "subs %w3, %w3, #16 \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" - "99: \n" + "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 @@ -2073,7 +2061,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. - "8: \n" + "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 // pixels "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 @@ -2096,12 +2084,12 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, // pixels "b.ge 8b \n" - "89: \n" + "89: \n" "adds %w3, %w3, #8-1 \n" "b.lt 99f \n" // Blend 1 pixels. - "1: \n" + "1: \n" "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. "subs %w3, %w3, #1 \n" // 1 processed per loop. @@ -2121,7 +2109,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
"b.ge 1b \n" - "99: \n" + "99: \n" : "+r"(src_argb0), // %0 "+r"(src_argb1), // %1 @@ -2136,9 +2124,8 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile( // Attenuate 8 pixels. - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v5.8h, v1.8b, v3.8b \n" // g * a @@ -2170,9 +2157,8 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. - "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of - // ARGB. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" @@ -2190,7 +2176,6 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, "uqxtn v1.8b, v1.8h \n" "uqxtn v2.8b, v2.8h \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB - // pixels "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 @@ -2213,9 +2198,8 @@ void ARGBShadeRow_NEON(const uint8* src_argb, "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. - "1: \n" + "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v5.8h, v5.8b \n" @@ -2230,7 +2214,6 @@ void ARGBShadeRow_NEON(const uint8* src_argb, "uqxtn v6.8b, v6.8h \n" "uqxtn v7.8b, v7.8h \n" "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB - // pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -2247,9 +2230,8 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "movi v24.8b, #15 \n" // B * 0.11400 coefficient "movi v25.8b, #75 \n" // G * 0.58700 coefficient "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - // pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "umlal v4.8h, v1.8b, v25.8b \n" // G @@ -2282,7 +2264,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "movi v28.8b, #24 \n" // BB coefficient "movi v29.8b, #98 \n" // BG coefficient "movi v30.8b, #50 \n" // BR coefficient - "1: \n" + "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "subs %w1, %w1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B @@ -2318,9 +2300,8 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - "1: \n" - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 - // pixels. + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v17.8h, v17.8b \n" // g @@ -2358,8 +2339,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 - // pixels. + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -2377,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, int width) { asm volatile( // 8 pixel loop. 
@@ -2377,11 +2357,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
                           int width) {
   asm volatile(
       // 8 pixel loop.
-      "1:                                       \n"
+      "1:                                         \n"
       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
-                                                            // pixels.
       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
-                                                            // pixels.
       "subs       %w3, %w3, #8                  \n"  // 8 processed per loop.
       "umull      v0.8h, v0.8b, v4.8b           \n"  // multiply B
       "umull      v1.8h, v1.8b, v5.8b           \n"  // multiply G
@@ -2392,9 +2370,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0,
       "rshrn      v2.8b, v2.8h, #8              \n"  // 16 bit to 8 bit R
       "rshrn      v3.8b, v3.8h, #8              \n"  // 16 bit to 8 bit A
       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
-                                                            // pixels
       "b.gt       1b                            \n"
-
       : "+r"(src_argb0),  // %0
         "+r"(src_argb1),  // %1
         "+r"(dst_argb),   // %2
@@ -2410,20 +2386,16 @@ void ARGBAddRow_NEON(const uint8* src_argb0,
                      int width) {
   asm volatile(
       // 8 pixel loop.
-      "1:                                       \n"
+      "1:                                         \n"
       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
-                                                            // pixels.
       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
-                                                            // pixels.
       "subs       %w3, %w3, #8                  \n"  // 8 processed per loop.
       "uqadd      v0.8b, v0.8b, v4.8b           \n"
       "uqadd      v1.8b, v1.8b, v5.8b           \n"
       "uqadd      v2.8b, v2.8b, v6.8b           \n"
       "uqadd      v3.8b, v3.8b, v7.8b           \n"
       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
-                                                            // pixels
       "b.gt       1b                            \n"
-
       : "+r"(src_argb0),  // %0
         "+r"(src_argb1),  // %1
         "+r"(dst_argb),   // %2
@@ -2439,20 +2411,16 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0,
                           int width) {
   asm volatile(
       // 8 pixel loop.
-      "1:                                       \n"
+      "1:                                         \n"
       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
-                                                            // pixels.
       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
-                                                            // pixels.
       "subs       %w3, %w3, #8                  \n"  // 8 processed per loop.
       "uqsub      v0.8b, v0.8b, v4.8b           \n"
       "uqsub      v1.8b, v1.8b, v5.8b           \n"
       "uqsub      v2.8b, v2.8b, v6.8b           \n"
       "uqsub      v3.8b, v3.8b, v7.8b           \n"
       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
-                                                            // pixels
       "b.gt       1b                            \n"
-
       : "+r"(src_argb0),  // %0
         "+r"(src_argb1),  // %1
         "+r"(dst_argb),   // %2
@@ -2473,7 +2441,7 @@ void SobelRow_NEON(const uint8* src_sobelx,
   asm volatile(
       "movi       v3.8b, #255                   \n"  // alpha
       // 8 pixel loop.
-      "1:                                       \n"
+      "1:                                         \n"
       "ld1        {v0.8b}, [%0], #8             \n"  // load 8 sobelx.
       "ld1        {v1.8b}, [%1], #8             \n"  // load 8 sobely.
       "subs       %w3, %w3, #8                  \n"  // 8 processed per loop.
@@ -2481,7 +2449,6 @@ void SobelRow_NEON(const uint8* src_sobelx,
       "orr        v1.8b, v0.8b, v0.8b           \n"
       "orr        v2.8b, v0.8b, v0.8b           \n"
       "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
-                                                            // pixels
       "b.gt       1b                            \n"
       : "+r"(src_sobelx),  // %0
         "+r"(src_sobely),  // %1
@@ -2498,7 +2465,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx,
                           int width) {
   asm volatile(
       // 16 pixel loop.
-      "1:                                       \n"
+      "1:                                         \n"
       "ld1        {v0.16b}, [%0], #16           \n"  // load 16 sobelx.
       "ld1        {v1.16b}, [%1], #16           \n"  // load 16 sobely.
       "subs       %w3, %w3, #16                 \n"  // 16 processed per loop.
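ARGBMultiplyRow_NEON above widens with umull and narrows with rshrn #8, i.e. a per-channel product scaled back to 8 bits with a rounding shift, while ARGBAddRow/ARGBSubtractRow are plain saturating byte operations. A one-line scalar equivalent of the multiply step, with the helper name chosen here for illustration:

#include <stdint.h>

// Scalar equivalent of the umull + rshrn #8 pair in ARGBMultiplyRow_NEON:
// (a * b) scaled back to 8 bits with round-to-nearest.
static inline uint8_t MulRound8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a * b + 128u) >> 8);
}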
"uqadd v1.8b, v0.8b, v2.8b \n" // add "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB - // pixels "b.gt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 @@ -2551,7 +2517,7 @@ void SobelXRow_NEON(const uint8* src_y0, uint8* dst_sobelx, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" "usubl v0.8h, v0.8b, v1.8b \n" @@ -2589,7 +2555,7 @@ void SobelYRow_NEON(const uint8* src_y0, uint8* dst_sobely, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v0.8b}, [%0],%4 \n" // left "ld1 {v1.8b}, [%1],%4 \n" "usubl v0.8h, v0.8b, v1.8b \n" @@ -2620,7 +2586,7 @@ void SobelYRow_NEON(const uint8* src_y0, // Caveat - rounds float to half float whereas scaling version truncates. void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's @@ -2640,7 +2606,7 @@ void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) { void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) { asm volatile( - "1: \n" + "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's @@ -2680,7 +2646,6 @@ float ScaleMaxSamples_NEON(const float* src, "b.gt 1b \n" "fmax v5.4s, v5.4s, v6.4s \n" // max "fmaxv %s3, v5.4s \n" // signed max acculator - : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2707,12 +2672,10 @@ float ScaleSumSamples_NEON(const float* src, "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v6.4s, v2.4s, v2.4s \n" "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples - "b.gt 1b \n" "faddp v5.4s, v5.4s, v6.4s \n" "faddp v5.4s, v5.4s, v5.4s \n" "faddp %3.4s, v5.4s, v5.4s \n" // sum - : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width), // %2 @@ -2731,7 +2694,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) { "fmul v2.4s, v2.4s, %3.s[0] \n" // scale "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples "b.gt 1b \n" - : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -2768,7 +2730,6 @@ void GaussCol_NEON(const uint16* src0, "subs %w6, %w6, #8 \n" // 8 processed per loop "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples "b.gt 1b \n" - : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 @@ -2807,7 +2768,6 @@ void GaussRow_NEON(const uint32* src, uint16* dst, int width) { "uqrshrn2 v0.8h, v1.4s, #8 \n" "st1 {v0.8h}, [%4], #16 \n" // store 8 samples "b.gt 1b \n" - : "+r"(src), // %0 "+r"(src1), // %1 "+r"(src2), // %2 diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index f480805bb..93fe67bf1 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, "subs %w3, %w3, #16 \n" // 16 processed per loop "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + - // row1 + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and - // pack + "rshrn v0.8b, v0.8h, #2 \n" // round and pack "rshrn2 v0.16b, v1.8h, #2 \n" "st1 {v0.16b}, [%2], #16 \n" "b.gt 1b \n" @@ -101,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, (void)src_stride; asm volatile( "1: \n" - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, 
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index f480805bb..93fe67bf1 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -77,11 +77,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr,
       "subs       %w3, %w3, #16                 \n"  // 16 processed per loop
       "uaddlp     v0.8h, v0.16b                 \n"  // row 1 add adjacent
       "uaddlp     v1.8h, v1.16b                 \n"
-      "uadalp     v0.8h, v2.16b                 \n"  // row 2 add adjacent +
-                                                     // row1
+      "uadalp     v0.8h, v2.16b                 \n"  // += row 2 add adjacent
       "uadalp     v1.8h, v3.16b                 \n"
-      "rshrn      v0.8b, v0.8h, #2              \n"  // downshift, round and
-                                                     // pack
+      "rshrn      v0.8b, v0.8h, #2              \n"  // round and pack
       "rshrn2     v0.16b, v1.8h, #2             \n"
       "st1        {v0.16b}, [%2], #16           \n"
       "b.gt       1b                            \n"
@@ -101,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr,
   (void)src_stride;
   asm volatile(
       "1:                                       \n"
-      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"   // src line 0
       "subs       %w2, %w2, #8                  \n"  // 8 processed per loop
       "st1        {v2.8b}, [%1], #8             \n"
       "b.gt       1b                            \n"
@@ -230,7 +228,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
   asm volatile(
       "movi       v20.8b, #3                    \n"
       "add        %3, %3, %0                    \n"
-      "1:                                       \n"
+      "1:                                         \n"
       "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
       "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
       "subs       %w2, %w2, #24                 \n"
@@ -279,7 +277,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
   (void)src_stride;
   asm volatile(
       "ld1        {v3.16b}, [%3]                \n"
-      "1:                                       \n"
+      "1:                                         \n"
       "ld1        {v0.16b,v1.16b}, [%0], #32    \n"
       "subs       %w2, %w2, #12                 \n"
       "tbl        v2.16b, {v0.16b,v1.16b}, v3.16b \n"
@@ -394,8 +392,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
       "sqrdmulh   v0.8h, v20.8h, v31.8h         \n"
       "sqrdmulh   v1.8h, v21.8h, v31.8h         \n"

-      // Align for table lookup, vtbl requires registers to
-      // be adjacent
+      // Align for table lookup, vtbl requires registers to be adjacent
       "tbl        v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

       "st1        {v3.8b}, [%1], #8             \n"
@@ -776,8 +773,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
       "uadalp     v1.8h, v17.16b                \n"  // G 16 bytes -> 8 shorts.
       "uadalp     v2.8h, v18.16b                \n"  // R 16 bytes -> 8 shorts.
       "uadalp     v3.8h, v19.16b                \n"  // A 16 bytes -> 8 shorts.
-      "rshrn      v0.8b, v0.8h, #2              \n"  // downshift, round and
-                                                     // pack
+      "rshrn      v0.8b, v0.8h, #2              \n"  // round and pack
       "rshrn      v1.8b, v1.8h, #2              \n"
       "rshrn      v2.8b, v2.8h, #2              \n"
       "rshrn      v3.8b, v3.8h, #2              \n"
@@ -827,8 +823,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
   asm volatile(
       "add        %1, %1, %0                    \n"
       "1:                                       \n"
-      "ld1        {v0.8b}, [%0], %4             \n"  // Read 4 2x2 blocks ->
-                                                     // 2x1
+      "ld1        {v0.8b}, [%0], %4             \n"  // Read 4 2x2 -> 2x1
       "ld1        {v1.8b}, [%1], %4             \n"
       "ld1        {v2.8b}, [%0], %4             \n"
       "ld1        {v3.8b}, [%1], %4             \n"
@@ -891,8 +886,7 @@ void ScaleARGBCols_NEON(uint8* dst_argb,
       LOAD1_DATA32_LANE(v1, 3)
       // clang-format on
       "st1        {v0.4s, v1.4s}, [%0], #32     \n"  // store pixels
-      "subs       %w2, %w2, #8                  \n"  // 8 processed per
-                                                     // loop
+      "subs       %w2, %w2, #8                  \n"  // 8 processed per loop
       "b.gt       1b                            \n"
       : "+r"(dst_argb),  // %0
         "+r"(src_argb),  // %1