diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc
index 5b251830c..0062d6746 100644
--- a/source/rotate_neon64.cc
+++ b/source/rotate_neon64.cc
@@ -27,120 +27,103 @@ void TransposeWx16_NEON(const uint8_t* src,
                         int dst_stride,
                         int width) {
   const uint8_t* src_temp;
-  asm("1: \n"
-      "mov %[src_temp], %[src] \n"
+  asm("1: \n"
+      "mov %[src_temp], %[src] \n"
 
-      "ld1 {v0.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v1.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v2.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v3.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v4.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v5.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v6.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v7.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v8.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v9.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v10.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v11.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v12.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v13.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v14.16b}, [%[src_temp]], %[src_stride] \n"
-      "ld1 {v15.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v16.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v17.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v18.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v19.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v20.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v21.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v22.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v23.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v24.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v25.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v26.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v27.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v28.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v29.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v30.16b}, [%[src_temp]], %[src_stride] \n"
+      "ld1 {v31.16b}, [%[src_temp]], %[src_stride] \n"
 
-      "add %[src], %[src], #16 \n"
-
-      // Transpose 8x8-byte blocks.
-      "trn1 v16.2d, v0.2d, v8.2d \n"
-      "trn1 v17.2d, v1.2d, v9.2d \n"
-      "trn1 v18.2d, v2.2d, v10.2d \n"
-      "trn1 v19.2d, v3.2d, v11.2d \n"
-      "trn1 v20.2d, v4.2d, v12.2d \n"
-      "trn1 v21.2d, v5.2d, v13.2d \n"
-      "trn1 v22.2d, v6.2d, v14.2d \n"
-      "trn1 v23.2d, v7.2d, v15.2d \n"
-      "trn2 v24.2d, v0.2d, v8.2d \n"
-      "trn2 v25.2d, v1.2d, v9.2d \n"
-      "trn2 v26.2d, v2.2d, v10.2d \n"
-      "trn2 v27.2d, v3.2d, v11.2d \n"
-      "trn2 v28.2d, v4.2d, v12.2d \n"
-      "trn2 v29.2d, v5.2d, v13.2d \n"
-      "trn2 v30.2d, v6.2d, v14.2d \n"
-      "trn2 v31.2d, v7.2d, v15.2d \n"
-
-      "subs %w[width], %w[width], #16 \n"
-
-      // Transpose 4x4-byte blocks within each 8x8 block.
-      "trn1 v0.4s, v16.4s, v20.4s \n"
-      "trn1 v1.4s, v17.4s, v21.4s \n"
-      "trn1 v2.4s, v18.4s, v22.4s \n"
-      "trn1 v3.4s, v19.4s, v23.4s \n"
-      "trn2 v4.4s, v16.4s, v20.4s \n"
-      "trn2 v5.4s, v17.4s, v21.4s \n"
-      "trn2 v6.4s, v18.4s, v22.4s \n"
-      "trn2 v7.4s, v19.4s, v23.4s \n"
-      "trn1 v8.4s, v24.4s, v28.4s \n"
-      "trn1 v9.4s, v25.4s, v29.4s \n"
-      "trn1 v10.4s, v26.4s, v30.4s \n"
-      "trn1 v11.4s, v27.4s, v31.4s \n"
-      "trn2 v12.4s, v24.4s, v28.4s \n"
-      "trn2 v13.4s, v25.4s, v29.4s \n"
-      "trn2 v14.4s, v26.4s, v30.4s \n"
-      "trn2 v15.4s, v27.4s, v31.4s \n"
-
-      // Transpose 2x2-byte blocks within each 4x4 block.
-      "trn1 v16.8h, v0.8h, v2.8h \n"
-      "trn1 v17.8h, v1.8h, v3.8h \n"
-      "trn2 v18.8h, v0.8h, v2.8h \n"
-      "trn2 v19.8h, v1.8h, v3.8h \n"
-      "trn1 v20.8h, v4.8h, v6.8h \n"
-      "trn1 v21.8h, v5.8h, v7.8h \n"
-      "trn2 v22.8h, v4.8h, v6.8h \n"
-      "trn2 v23.8h, v5.8h, v7.8h \n"
-      "trn1 v24.8h, v8.8h, v10.8h \n"
-      "trn1 v25.8h, v9.8h, v11.8h \n"
-      "trn2 v26.8h, v8.8h, v10.8h \n"
-      "trn2 v27.8h, v9.8h, v11.8h \n"
-      "trn1 v28.8h, v12.8h, v14.8h \n"
-      "trn1 v29.8h, v13.8h, v15.8h \n"
-      "trn2 v30.8h, v12.8h, v14.8h \n"
-      "trn2 v31.8h, v13.8h, v15.8h \n"
+      "add %[src], %[src], #16 \n"
 
       // Transpose bytes within each 2x2 block.
-      "trn1 v0.16b, v16.16b, v17.16b \n"
-      "trn2 v1.16b, v16.16b, v17.16b \n"
-      "trn1 v2.16b, v18.16b, v19.16b \n"
-      "trn2 v3.16b, v18.16b, v19.16b \n"
-      "trn1 v4.16b, v20.16b, v21.16b \n"
-      "trn2 v5.16b, v20.16b, v21.16b \n"
-      "trn1 v6.16b, v22.16b, v23.16b \n"
-      "trn2 v7.16b, v22.16b, v23.16b \n"
-      "trn1 v8.16b, v24.16b, v25.16b \n"
-      "trn2 v9.16b, v24.16b, v25.16b \n"
-      "trn1 v10.16b, v26.16b, v27.16b \n"
-      "trn2 v11.16b, v26.16b, v27.16b \n"
-      "trn1 v12.16b, v28.16b, v29.16b \n"
-      "trn2 v13.16b, v28.16b, v29.16b \n"
-      "trn1 v14.16b, v30.16b, v31.16b \n"
-      "trn2 v15.16b, v30.16b, v31.16b \n"
+      "trn1 v0.16b, v16.16b, v17.16b \n"
+      "trn2 v1.16b, v16.16b, v17.16b \n"
+      "trn1 v2.16b, v18.16b, v19.16b \n"
+      "trn2 v3.16b, v18.16b, v19.16b \n"
+      "trn1 v4.16b, v20.16b, v21.16b \n"
+      "trn2 v5.16b, v20.16b, v21.16b \n"
+      "trn1 v6.16b, v22.16b, v23.16b \n"
+      "trn2 v7.16b, v22.16b, v23.16b \n"
+      "trn1 v8.16b, v24.16b, v25.16b \n"
+      "trn2 v9.16b, v24.16b, v25.16b \n"
+      "trn1 v10.16b, v26.16b, v27.16b \n"
+      "trn2 v11.16b, v26.16b, v27.16b \n"
+      "trn1 v12.16b, v28.16b, v29.16b \n"
+      "trn2 v13.16b, v28.16b, v29.16b \n"
+      "trn1 v14.16b, v30.16b, v31.16b \n"
+      "trn2 v15.16b, v30.16b, v31.16b \n"
 
-      "st1 {v0.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v1.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v2.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v3.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v4.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v5.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v6.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v7.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v8.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v9.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v10.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v11.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v12.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v13.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v14.16b}, [%[dst]], %[dst_stride] \n"
-      "st1 {v15.16b}, [%[dst]], %[dst_stride] \n"
+      // Transpose 2x2-byte blocks within each 4x4 block.
+      "trn1 v16.8h, v0.8h, v2.8h \n"
+      "trn1 v17.8h, v1.8h, v3.8h \n"
+      "trn2 v18.8h, v0.8h, v2.8h \n"
+      "trn2 v19.8h, v1.8h, v3.8h \n"
+      "trn1 v20.8h, v4.8h, v6.8h \n"
+      "trn1 v21.8h, v5.8h, v7.8h \n"
+      "trn2 v22.8h, v4.8h, v6.8h \n"
+      "trn2 v23.8h, v5.8h, v7.8h \n"
+      "trn1 v24.8h, v8.8h, v10.8h \n"
+      "trn1 v25.8h, v9.8h, v11.8h \n"
+      "trn2 v26.8h, v8.8h, v10.8h \n"
+      "trn2 v27.8h, v9.8h, v11.8h \n"
+      "trn1 v28.8h, v12.8h, v14.8h \n"
+      "trn1 v29.8h, v13.8h, v15.8h \n"
+      "trn2 v30.8h, v12.8h, v14.8h \n"
+      "trn2 v31.8h, v13.8h, v15.8h \n"
 
-      "b.gt 1b \n"
+      "subs %w[width], %w[width], #16 \n"
+
+      // Transpose 4x4-byte blocks within each 8x8 block.
+      "trn1 v0.4s, v16.4s, v20.4s \n"
+      "trn1 v2.4s, v17.4s, v21.4s \n"
+      "trn1 v4.4s, v18.4s, v22.4s \n"
+      "trn1 v6.4s, v19.4s, v23.4s \n"
+      "trn2 v8.4s, v16.4s, v20.4s \n"
+      "trn2 v10.4s, v17.4s, v21.4s \n"
+      "trn2 v12.4s, v18.4s, v22.4s \n"
+      "trn2 v14.4s, v19.4s, v23.4s \n"
+      "trn1 v1.4s, v24.4s, v28.4s \n"
+      "trn1 v3.4s, v25.4s, v29.4s \n"
+      "trn1 v5.4s, v26.4s, v30.4s \n"
+      "trn1 v7.4s, v27.4s, v31.4s \n"
+      "trn2 v9.4s, v24.4s, v28.4s \n"
+      "trn2 v11.4s, v25.4s, v29.4s \n"
+      "trn2 v13.4s, v26.4s, v30.4s \n"
+      "trn2 v15.4s, v27.4s, v31.4s \n"
+
+      // Transpose 8x8-byte blocks and store.
+      "st2 {v0.d, v1.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v2.d, v3.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v4.d, v5.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v6.d, v7.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v8.d, v9.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v10.d, v11.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v12.d, v13.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v14.d, v15.d}[0], [%[dst]], %[dst_stride] \n"
+      "st2 {v0.d, v1.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v2.d, v3.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v4.d, v5.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v6.d, v7.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v8.d, v9.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v10.d, v11.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v12.d, v13.d}[1], [%[dst]], %[dst_stride] \n"
+      "st2 {v14.d, v15.d}[1], [%[dst]], %[dst_stride] \n"
+
+      "b.gt 1b \n"
       : [src] "+r"(src),            // %[src]
         [src_temp] "=&r"(src_temp), // %[src_temp]
         [dst] "+r"(dst),            // %[dst]