diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h index 3e6a2fef2..69318e03f 100644 --- a/include/libyuv/rotate_row.h +++ b/include/libyuv/rotate_row.h @@ -57,7 +57,11 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#if defined(__aarch64__) +#define HAS_TRANSPOSEWX16_NEON +#else #define HAS_TRANSPOSEWX8_NEON +#endif #define HAS_TRANSPOSEUVWX8_NEON #define HAS_TRANSPOSE4X4_32_NEON #endif @@ -94,6 +98,11 @@ void TransposeWx8_NEON(const uint8_t* src, uint8_t* dst, int dst_stride, int width); +void TransposeWx16_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, @@ -120,6 +129,11 @@ void TransposeWx8_Any_NEON(const uint8_t* src, uint8_t* dst, int dst_stride, int width); +void TransposeWx16_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); void TransposeWx8_Any_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, diff --git a/source/rotate.cc b/source/rotate.cc index 3f8332c34..497458e1c 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -31,7 +31,8 @@ void TransposePlane(const uint8_t* src, int width, int height) { int i = height; -#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) || \ + defined(HAS_TRANSPOSEWX16_NEON) void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx16_C; #else @@ -47,6 +48,14 @@ void TransposePlane(const uint8_t* src, } } #endif +#if defined(HAS_TRANSPOSEWX16_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + TransposeWx16 = TransposeWx16_Any_NEON; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_NEON; + } + } +#endif #if defined(HAS_TRANSPOSEWX8_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Any_SSSE3; @@ -80,7 +89,8 @@ void TransposePlane(const uint8_t* src, } #endif -#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) +#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX) || \ + defined(HAS_TRANSPOSEWX16_NEON) // Work across the source in 16x16 tiles while (i >= 16) { TransposeWx16(src, src_stride, dst, dst_stride, width); diff --git a/source/rotate_any.cc b/source/rotate_any.cc index 88ca78765..949a7f7a1 100644 --- a/source/rotate_any.cc +++ b/source/rotate_any.cc @@ -32,6 +32,9 @@ extern "C" { #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #endif +#ifdef HAS_TRANSPOSEWX16_NEON +TANY(TransposeWx16_Any_NEON, TransposeWx16_NEON, 15) +#endif #ifdef HAS_TRANSPOSEWX8_SSSE3 TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif diff --git a/source/rotate_common.cc b/source/rotate_common.cc index e72608e9a..e0341fec4 100644 --- a/source/rotate_common.cc +++ b/source/rotate_common.cc @@ -36,6 +36,16 @@ void TransposeWx8_C(const uint8_t* src, } } +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + void TransposeUVWx8_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/rotate_lsx.cc b/source/rotate_lsx.cc index 94a2b91cd..b292803a0 100644 --- a/source/rotate_lsx.cc +++ b/source/rotate_lsx.cc @@ -61,16 +61,6 @@ extern "C" { _dst += _stride2; \ } -void 
TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/rotate_msa.cc b/source/rotate_msa.cc index 99bdca65b..d4e62b12e 100644 --- a/source/rotate_msa.cc +++ b/source/rotate_msa.cc @@ -51,16 +51,6 @@ extern "C" { out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } -void TransposeWx16_C(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { - TransposeWx8_C(src, src_stride, dst, dst_stride, width); - TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, - width); -} - void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, diff --git a/source/rotate_neon64.cc b/source/rotate_neon64.cc index 95047fa7a..08b11a5a9 100644 --- a/source/rotate_neon64.cc +++ b/source/rotate_neon64.cc @@ -21,195 +21,136 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, - 2, 6, 10, 14, 3, 7, 11, 15}; - -void TransposeWx8_NEON(const uint8_t* src, - int src_stride, - uint8_t* dst, - int dst_stride, - int width) { +void TransposeWx16_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { const uint8_t* src_temp; - asm volatile( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %w3, %w3, #8 \n" + asm("1: \n" + "mov %[src_temp], %[src] \n" - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" + "ld1 {v0.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v1.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v2.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v3.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v4.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v5.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v6.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v7.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v8.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v9.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v10.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v11.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v12.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v13.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v14.16b}, [%[src_temp]], %[src_stride] \n" + "ld1 {v15.16b}, [%[src_temp]], %[src_stride] \n" - "ld1 {v0.8b}, [%0], %5 \n" - "ld1 {v1.8b}, [%0], %5 \n" - "ld1 {v2.8b}, [%0], %5 \n" - "ld1 {v3.8b}, [%0], %5 \n" - "ld1 {v4.8b}, [%0], %5 \n" - "ld1 {v5.8b}, [%0], %5 \n" - "ld1 {v6.8b}, [%0], %5 \n" - "ld1 {v7.8b}, [%0] \n" - "mov %0, %1 \n" + "add %[src], %[src], #16 \n" - "trn2 v16.8b, v0.8b, v1.8b \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "trn1 v17.8b, v0.8b, v1.8b \n" - "add %0, %0, %5 \n" - "trn2 v18.8b, v2.8b, v3.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 1 - "trn1 v19.8b, v2.8b, v3.8b \n" - "add %0, %0, %5 \n" - "trn2 v20.8b, v4.8b, v5.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 2 - "trn1 v21.8b, v4.8b, v5.8b \n" - "add %0, %0, %5 \n" - "trn2 v22.8b, v6.8b, v7.8b \n" - "prfm pldl1keep, [%0, 448] \n" // row 3 - "trn1 v23.8b, v6.8b, v7.8b \n" - "add %0, %0, %5 \n" + // Transpose 8x8-byte blocks. 
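+      // Round 1 of 4: TRN1/TRN2 on .2d lanes pair the matching 8-byte halves
+      // of rows i and i+8, i.e. the block-level transpose of the tile viewed
+      // as a 2x2 grid of 8x8-byte blocks. The later rounds transpose within
+      // each block.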
+ "trn1 v16.2d, v0.2d, v8.2d \n" + "trn1 v17.2d, v1.2d, v9.2d \n" + "trn1 v18.2d, v2.2d, v10.2d \n" + "trn1 v19.2d, v3.2d, v11.2d \n" + "trn1 v20.2d, v4.2d, v12.2d \n" + "trn1 v21.2d, v5.2d, v13.2d \n" + "trn1 v22.2d, v6.2d, v14.2d \n" + "trn1 v23.2d, v7.2d, v15.2d \n" + "trn2 v24.2d, v0.2d, v8.2d \n" + "trn2 v25.2d, v1.2d, v9.2d \n" + "trn2 v26.2d, v2.2d, v10.2d \n" + "trn2 v27.2d, v3.2d, v11.2d \n" + "trn2 v28.2d, v4.2d, v12.2d \n" + "trn2 v29.2d, v5.2d, v13.2d \n" + "trn2 v30.2d, v6.2d, v14.2d \n" + "trn2 v31.2d, v7.2d, v15.2d \n" - "trn2 v3.4h, v17.4h, v19.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 4 - "trn1 v1.4h, v17.4h, v19.4h \n" - "add %0, %0, %5 \n" - "trn2 v2.4h, v16.4h, v18.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 5 - "trn1 v0.4h, v16.4h, v18.4h \n" - "add %0, %0, %5 \n" - "trn2 v7.4h, v21.4h, v23.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 6 - "trn1 v5.4h, v21.4h, v23.4h \n" - "add %0, %0, %5 \n" - "trn2 v6.4h, v20.4h, v22.4h \n" - "prfm pldl1keep, [%0, 448] \n" // row 7 - "trn1 v4.4h, v20.4h, v22.4h \n" + "subs %w[width], %w[width], #16 \n" - "trn2 v21.2s, v1.2s, v5.2s \n" - "trn1 v17.2s, v1.2s, v5.2s \n" - "trn2 v20.2s, v0.2s, v4.2s \n" - "trn1 v16.2s, v0.2s, v4.2s \n" - "trn2 v23.2s, v3.2s, v7.2s \n" - "trn1 v19.2s, v3.2s, v7.2s \n" - "trn2 v22.2s, v2.2s, v6.2s \n" - "trn1 v18.2s, v2.2s, v6.2s \n" + // Transpose 4x4-byte blocks within each 8x8 block. + "trn1 v0.4s, v16.4s, v20.4s \n" + "trn1 v1.4s, v17.4s, v21.4s \n" + "trn1 v2.4s, v18.4s, v22.4s \n" + "trn1 v3.4s, v19.4s, v23.4s \n" + "trn2 v4.4s, v16.4s, v20.4s \n" + "trn2 v5.4s, v17.4s, v21.4s \n" + "trn2 v6.4s, v18.4s, v22.4s \n" + "trn2 v7.4s, v19.4s, v23.4s \n" + "trn1 v8.4s, v24.4s, v28.4s \n" + "trn1 v9.4s, v25.4s, v29.4s \n" + "trn1 v10.4s, v26.4s, v30.4s \n" + "trn1 v11.4s, v27.4s, v31.4s \n" + "trn2 v12.4s, v24.4s, v28.4s \n" + "trn2 v13.4s, v25.4s, v29.4s \n" + "trn2 v14.4s, v26.4s, v30.4s \n" + "trn2 v15.4s, v27.4s, v31.4s \n" - "mov %0, %2 \n" + // Transpose 2x2-byte blocks within each 4x4 block. + "trn1 v16.8h, v0.8h, v2.8h \n" + "trn1 v17.8h, v1.8h, v3.8h \n" + "trn2 v18.8h, v0.8h, v2.8h \n" + "trn2 v19.8h, v1.8h, v3.8h \n" + "trn1 v20.8h, v4.8h, v6.8h \n" + "trn1 v21.8h, v5.8h, v7.8h \n" + "trn2 v22.8h, v4.8h, v6.8h \n" + "trn2 v23.8h, v5.8h, v7.8h \n" + "trn1 v24.8h, v8.8h, v10.8h \n" + "trn1 v25.8h, v9.8h, v11.8h \n" + "trn2 v26.8h, v8.8h, v10.8h \n" + "trn2 v27.8h, v9.8h, v11.8h \n" + "trn1 v28.8h, v12.8h, v14.8h \n" + "trn1 v29.8h, v13.8h, v15.8h \n" + "trn2 v30.8h, v12.8h, v14.8h \n" + "trn2 v31.8h, v13.8h, v15.8h \n" - "st1 {v17.8b}, [%0], %6 \n" - "st1 {v16.8b}, [%0], %6 \n" - "st1 {v19.8b}, [%0], %6 \n" - "st1 {v18.8b}, [%0], %6 \n" - "st1 {v21.8b}, [%0], %6 \n" - "st1 {v20.8b}, [%0], %6 \n" - "st1 {v23.8b}, [%0], %6 \n" - "st1 {v22.8b}, [%0] \n" + // Transpose bytes within each 2x2 block. 
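+      // Round 4 of 4: interleave adjacent bytes. v0..v15 then hold the
+      // sixteen columns of the source tile, ready to be stored below as rows
+      // of the destination.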
+ "trn1 v0.16b, v16.16b, v17.16b \n" + "trn2 v1.16b, v16.16b, v17.16b \n" + "trn1 v2.16b, v18.16b, v19.16b \n" + "trn2 v3.16b, v18.16b, v19.16b \n" + "trn1 v4.16b, v20.16b, v21.16b \n" + "trn2 v5.16b, v20.16b, v21.16b \n" + "trn1 v6.16b, v22.16b, v23.16b \n" + "trn2 v7.16b, v22.16b, v23.16b \n" + "trn1 v8.16b, v24.16b, v25.16b \n" + "trn2 v9.16b, v24.16b, v25.16b \n" + "trn1 v10.16b, v26.16b, v27.16b \n" + "trn2 v11.16b, v26.16b, v27.16b \n" + "trn1 v12.16b, v28.16b, v29.16b \n" + "trn2 v13.16b, v28.16b, v29.16b \n" + "trn1 v14.16b, v30.16b, v31.16b \n" + "trn2 v15.16b, v30.16b, v31.16b \n" - "add %1, %1, #8 \n" // src += 8 - "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %w3, %w3, #8 \n" // w -= 8 - "b.ge 1b \n" + "st1 {v0.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v1.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v2.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v3.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v4.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v5.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v6.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v7.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v8.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v9.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v10.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v11.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v12.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v13.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v14.16b}, [%[dst]], %[dst_stride] \n" + "st1 {v15.16b}, [%[dst]], %[dst_stride] \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %w3, %w3, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %w3, #2 \n" - "b.lt 3f \n" - - "cmp %w3, #4 \n" - "b.lt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - "ld1 {v0.s}[0], [%0], %5 \n" - "ld1 {v0.s}[1], [%0], %5 \n" - "ld1 {v0.s}[2], [%0], %5 \n" - "ld1 {v0.s}[3], [%0], %5 \n" - "ld1 {v1.s}[0], [%0], %5 \n" - "ld1 {v1.s}[1], [%0], %5 \n" - "ld1 {v1.s}[2], [%0], %5 \n" - "ld1 {v1.s}[3], [%0] \n" - - "mov %0, %2 \n" - - "ld1 {v2.16b}, [%4] \n" - - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- "st1 {v3.s}[0], [%0], %6 \n" - "st1 {v3.s}[1], [%0], %6 \n" - "st1 {v3.s}[2], [%0], %6 \n" - "st1 {v3.s}[3], [%0] \n" - - "add %0, %2, #4 \n" - "st1 {v0.s}[0], [%0], %6 \n" - "st1 {v0.s}[1], [%0], %6 \n" - "st1 {v0.s}[2], [%0], %6 \n" - "st1 {v0.s}[3], [%0] \n" - - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %w3, %w3, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %w3, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - "ld1 {v0.h}[0], [%0], %5 \n" - "ld1 {v1.h}[0], [%0], %5 \n" - "ld1 {v0.h}[1], [%0], %5 \n" - "ld1 {v1.h}[1], [%0], %5 \n" - "ld1 {v0.h}[2], [%0], %5 \n" - "ld1 {v1.h}[2], [%0], %5 \n" - "ld1 {v0.h}[3], [%0], %5 \n" - "ld1 {v1.h}[3], [%0] \n" - - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" - - "mov %0, %2 \n" - - "st1 {v3.8b}, [%0], %6 \n" - "st1 {v2.8b}, [%0] \n" - - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %w3, %w3, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - "ld1 {v0.b}[0], [%1], %5 \n" - "ld1 {v0.b}[1], [%1], %5 \n" - "ld1 {v0.b}[2], [%1], %5 \n" - "ld1 {v0.b}[3], [%1], %5 \n" - "ld1 {v0.b}[4], [%1], %5 \n" - "ld1 {v0.b}[5], [%1], %5 \n" - "ld1 {v0.b}[6], [%1], %5 \n" - "ld1 {v0.b}[7], [%1] \n" - - "st1 {v0.8b}, [%2] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"((ptrdiff_t)src_stride), // %5 - "r"((ptrdiff_t)dst_stride) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23"); + "b.gt 1b \n" + : [src] "+r"(src), // %[src] + [src_temp] "=&r"(src_temp), // %[src_temp] + [dst] "+r"(dst), // %[dst] + [width] "+r"(width) // %[width] + : [src_stride] "r"((ptrdiff_t)src_stride), // %[src_stride] + [dst_stride] "r"((ptrdiff_t)dst_stride) // %[dst_stride] + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31"); } static const uint8_t kVTbl4x4TransposeDi[32] = {