diff --git a/source/row_sve.cc b/source/row_sve.cc index a4acb69a4..7251fe79d 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -217,9 +217,7 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, NV21ToRGB24Row_SVE_SC(src_y, src_vu, dst_rgb24, yuvconstants, width); } -// Dot-product constants are stored as four-tuples with the two innermost -// elements flipped to account for the interleaving nature of the widening -// addition instructions. +// SVE constants are stored negated such that we can store 128 in int8_t. // RGB to BT601 coefficients // UB 0.875 coefficient = 112 @@ -229,25 +227,24 @@ void NV21ToRGB24Row_SVE2(const uint8_t* src_y, // VG -0.7344 coefficient = -94 // VR 0.875 coefficient = 112 -// SVE constants are not negated -static const int16_t kARGBToUVCoefficients[] = { - // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 112, -38, -74, 0, -18, 112, -94, 0, +static const int8_t kARGBToUVCoefficients[] = { + // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 + -112, 74, 38, 0, 18, 94, -112, 0, }; -static const int16_t kRGBAToUVCoefficients[] = { - // 0, -UG, UB, -UR, 0, -VG, -VB, VR - 0, -74, 112, -38, 0, -94, -18, 112, +static const int8_t kABGRToUVCoefficients[] = { + // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 + 38, 74, -112, 0, -112, 94, 18, 0, }; -static const int16_t kBGRAToUVCoefficients[] = { - // 0, -UG, -UR, UB, 0, -VG, VR, -VB - 0, -74, -38, 112, 0, -94, 112, -18, +static const int8_t kBGRAToUVCoefficients[] = { + // 0, -UR, -UG, -UB, 0, -VR, -VG, -VB + 0, 38, 74, -112, 0, -112, 94, 18, }; -static const int16_t kABGRToUVCoefficients[] = { - // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -38, 112, -74, 0, 112, -18, -94, 0, +static const int8_t kRGBAToUVCoefficients[] = { + // 0, -UB, -UG, -UR, 0, -VB, -VG, -VR + 0, -112, 74, 38, 0, 18, 94, -112, }; // RGB to JPEG coefficients @@ -258,169 +255,138 @@ static const int16_t kABGRToUVCoefficients[] = { // VG -0.41869 coefficient = -107 // VR 0.500 coefficient = 128 -static const int16_t kARGBToUVJCoefficients[] = { - // UB, -UR, -UG, 0, -VB, VR, -VG, 0 - 128, -43, -85, 0, -21, 128, -107, 0, +static const int8_t kARGBToUVJCoefficients[] = { + // -UB, -UG, -UR, 0, -VB, -VG, -VR, 0 + -128, 85, 43, 0, 21, 107, -128, 0, }; -static const int16_t kABGRToUVJCoefficients[] = { - // -UR, UB, -UG, 0, VR, -VB, -VG, 0 - -43, 128, -85, 0, 128, -21, -107, 0, +static const int8_t kABGRToUVJCoefficients[] = { + // -UR, -UG, -UB, 0, -VR, -VG, -VB, 0 + 43, 85, -128, 0, -128, 107, 21, 0, }; +#define ABCDTOUVMATRIX_SVE \ + "ld1d {z0.d}, p1/z, [%[src0]] \n" /* ABCD(bgra) */ \ + "ld1d {z1.d}, p2/z, [%[src0], #1, mul vl] \n" /* EFGH(bgra) */ \ + "ld1d {z2.d}, p3/z, [%[src0], #2, mul vl] \n" /* IJKL(bgra) */ \ + "ld1d {z3.d}, p4/z, [%[src0], #3, mul vl] \n" /* MNOP(bgra) */ \ + "ld1d {z4.d}, p1/z, [%[src1]] \n" /* ABCD(bgra) */ \ + "ld1d {z5.d}, p2/z, [%[src1], #1, mul vl] \n" /* EFGH(bgra) */ \ + "ld1d {z6.d}, p3/z, [%[src1], #2, mul vl] \n" /* IJKL(bgra) */ \ + "ld1d {z7.d}, p4/z, [%[src1], #3, mul vl] \n" /* MNOP(bgra) */ \ + "incb %[src0], all, mul #4 \n" \ + "incb %[src1], all, mul #4 \n" \ + \ + "uaddlb z16.h, z0.b, z4.b \n" /* ABCD(br) */ \ + "uaddlb z18.h, z1.b, z5.b \n" /* EFGH(br) */ \ + "uaddlb z20.h, z2.b, z6.b \n" /* IJKL(br) */ \ + "uaddlb z22.h, z3.b, z7.b \n" /* MNOP(br) */ \ + "uaddlt z17.h, z0.b, z4.b \n" /* ABCD(ga) */ \ + "uaddlt z19.h, z1.b, z5.b \n" /* EFGH(ga) */ \ + "uaddlt z21.h, z2.b, z6.b \n" /* IJKL(ga) */ \ + "uaddlt z23.h, z3.b, z7.b \n" /* MNOP(ga) */ \ + \ + /* Use ADDP on 32-bit elements to add adjacent pairs of 9-bit unsigned */ \ + "addp z16.s, p0/m, z16.s, z18.s \n" /* ABEFCDGH(br) */ \ + "addp z17.s, p0/m, z17.s, z19.s \n" /* ABEFCDGH(ga) */ \ + "addp z20.s, p0/m, z20.s, z22.s \n" /* IJMNKLOP(br) */ \ + "addp z21.s, p0/m, z21.s, z23.s \n" /* IJMNKLOP(ga) */ \ + \ + "rshrnb z0.b, z16.h, #2 \n" /* ABEFCDGH(b0r0) */ \ + "rshrnb z1.b, z20.h, #2 \n" /* IJMNKLOP(b0r0) */ \ + "rshrnt z0.b, z17.h, #2 \n" /* ABEFCDGH(bgra) */ \ + "rshrnt z1.b, z21.h, #2 \n" /* IJMNKLOP(bgra) */ \ + \ + "tbl z0.s, {z0.s}, z27.s \n" /* ABCDEFGH */ \ + "tbl z1.s, {z1.s}, z27.s \n" /* IJKLMNOP */ \ + \ + "subs %w[width], %w[width], %w[vl], lsl #2 \n" /* VL per loop */ \ + \ + "movi v16.8h, #0 \n" \ + "movi v17.8h, #0 \n" \ + "movi v20.8h, #0 \n" \ + "movi v21.8h, #0 \n" \ + \ + "usdot z16.s, z0.b, z24.b \n" \ + "usdot z17.s, z1.b, z24.b \n" \ + "usdot z20.s, z0.b, z25.b \n" \ + "usdot z21.s, z1.b, z25.b \n" \ + \ + "subhnb z16.b, z26.h, z16.h \n" /* U */ \ + "subhnb z20.b, z26.h, z20.h \n" /* V */ \ + "subhnb z17.b, z26.h, z17.h \n" /* U */ \ + "subhnb z21.b, z26.h, z21.h \n" /* V */ \ + \ + "uzp1 z16.h, z16.h, z17.h \n" \ + "uzp1 z20.h, z20.h, z21.h \n" \ + \ + "st1b {z16.h}, p5, [%[dst_u]] \n" /* U */ \ + "st1b {z20.h}, p5, [%[dst_v]] \n" /* V */ \ + "inch %[dst_u] \n" \ + "inch %[dst_v] \n" + static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, - const int16_t* uvconstants) { + const int8_t* uvconstants) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; uint64_t vl; + asm("cntd %x0" : "=r"(vl)); + + // Width is a multiple of two here, so halve it. + width >>= 1; + asm volatile( - "ptrue p0.b \n" - "ld1rd {z24.d}, p0/z, [%[uvconstants]] \n" - "ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n" - "mov z26.h, #0x8000 \n" // 128.0 (0x8000) - "cntb %[vl] \n" - "subs %w[width], %w[width], %w[vl] \n" - "b.lt 2f \n" + "ptrue p0.b \n" + "ld1rw {z24.s}, p0/z, [%[uvconstants]] \n" + "ld1rw {z25.s}, p0/z, [%[uvconstants], #4] \n" + "mov z26.h, #0x8000 \n" // 128.0 (0x8000) - // Process 4x vectors from each input row per iteration. - // Cannot use predication here due to unrolling. - "1: \n" // e.g. - "ld1b {z0.b}, p0/z, [%[src0], #0, mul vl] \n" // bgrabgra - "ld1b {z4.b}, p0/z, [%[src1], #0, mul vl] \n" // bgrabgra - "ld1b {z1.b}, p0/z, [%[src0], #1, mul vl] \n" // bgrabgra - "ld1b {z5.b}, p0/z, [%[src1], #1, mul vl] \n" // bgrabgra - "ld1b {z2.b}, p0/z, [%[src0], #2, mul vl] \n" // bgrabgra - "ld1b {z6.b}, p0/z, [%[src1], #2, mul vl] \n" // bgrabgra - "ld1b {z3.b}, p0/z, [%[src0], #3, mul vl] \n" // bgrabgra - "ld1b {z7.b}, p0/z, [%[src1], #3, mul vl] \n" // bgrabgra - "incb %[src0], all, mul #4 \n" - "incb %[src1], all, mul #4 \n" + // Generate some TBL indices to undo the interleaving from ADDP. + "index z0.s, #0, #1 \n" + "index z1.s, #1, #1 \n" + "uzp1 z27.s, z0.s, z1.s \n" - "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr - "uaddlt z17.h, z0.b, z4.b \n" // gagagaga - "uaddlb z18.h, z1.b, z5.b \n" // brbrbrbr - "uaddlt z19.h, z1.b, z5.b \n" // gagagaga - "uaddlb z20.h, z2.b, z6.b \n" // brbrbrbr - "uaddlt z21.h, z2.b, z6.b \n" // gagagaga - "uaddlb z22.h, z3.b, z7.b \n" // brbrbrbr - "uaddlt z23.h, z3.b, z7.b \n" // gagagaga + "subs %w[width], %w[width], %w[vl], lsl #2 \n" + "b.lt 2f \n" - "trn1 z0.s, z16.s, z17.s \n" // brgabgra - "trn2 z1.s, z16.s, z17.s \n" // brgabgra - "trn1 z2.s, z18.s, z19.s \n" // brgabgra - "trn2 z3.s, z18.s, z19.s \n" // brgabgra - "trn1 z4.s, z20.s, z21.s \n" // brgabgra - "trn2 z5.s, z20.s, z21.s \n" // brgabgra - "trn1 z6.s, z22.s, z23.s \n" // brgabgra - "trn2 z7.s, z22.s, z23.s \n" // brgabgra + "ptrue p1.d \n" + "ptrue p2.d \n" + "ptrue p3.d \n" + "ptrue p4.d \n" + "ptrue p5.h \n" + "1: \n" // + ABCDTOUVMATRIX_SVE + "b.gt 1b \n" - "subs %w[width], %w[width], %w[vl] \n" // 4*VL per loop + "2: \n" + "adds %w[width], %w[width], %w[vl], lsl #2 \n" + "b.eq 99f \n" - "add z0.h, p0/m, z0.h, z1.h \n" // brgabrga - "add z2.h, p0/m, z2.h, z3.h \n" // brgabrga - "add z4.h, p0/m, z4.h, z5.h \n" // brgabrga - "add z6.h, p0/m, z6.h, z7.h \n" // brgabrga + "3: \n" + "whilelt p1.d, wzr, %w[width] \n" + "whilelt p2.d, %w[vl], %w[width] \n" + "whilelt p3.d, %w[vl2], %w[width] \n" + "whilelt p4.d, %w[vl3], %w[width] \n" + "whilelt p5.h, wzr, %w[width] \n" // + ABCDTOUVMATRIX_SVE + "b.gt 3b \n" - "urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga - "urshr z2.h, p0/m, z2.h, #2 \n" // brgabrga - "urshr z4.h, p0/m, z4.h, #2 \n" // brgabrga - "urshr z6.h, p0/m, z6.h, #2 \n" // brgabrga - - "movi v16.8h, #0 \n" - "movi v17.8h, #0 \n" - "movi v18.8h, #0 \n" - "movi v19.8h, #0 \n" - - "movi v20.8h, #0 \n" - "movi v21.8h, #0 \n" - "movi v22.8h, #0 \n" - "movi v23.8h, #0 \n" - - "sdot z16.d, z0.h, z24.h \n" // UUxxxxxx - "sdot z17.d, z2.h, z24.h \n" // UUxxxxxx - "sdot z18.d, z4.h, z24.h \n" // UUxxxxxx - "sdot z19.d, z6.h, z24.h \n" // UUxxxxxx - - "sdot z20.d, z0.h, z25.h \n" // VVxxxxxx - "sdot z21.d, z2.h, z25.h \n" // VVxxxxxx - "sdot z22.d, z4.h, z25.h \n" // VVxxxxxx - "sdot z23.d, z6.h, z25.h \n" // VVxxxxxx - - "uzp1 z16.s, z16.s, z17.s \n" // UUxx - "uzp1 z18.s, z18.s, z19.s \n" // UUxx - "uzp1 z20.s, z20.s, z21.s \n" // VVxx - "uzp1 z22.s, z22.s, z23.s \n" // VVxx - - "uzp1 z16.h, z16.h, z18.h \n" // UU - "uzp1 z20.h, z20.h, z22.h \n" // VV - - "addhnb z16.b, z16.h, z26.h \n" // U - "addhnb z20.b, z20.h, z26.h \n" // V - - "st1b {z16.h}, p0, [%[dst_u]] \n" // U - "st1b {z20.h}, p0, [%[dst_v]] \n" // V - "inch %[dst_u] \n" - "inch %[dst_v] \n" - - "b.ge 1b \n" - - "2: \n" - "adds %w[width], %w[width], %w[vl] \n" // VL per loop - "b.le 99f \n" - - // Process remaining pixels from each input row. - // Use predication to do one vector from each input array, so may loop up - // to three iterations. - "cntw %x[vl] \n" - - "3: \n" - "whilelt p1.s, wzr, %w[width] \n" - "ld1d {z0.d}, p1/z, [%[src0]] \n" // bgrabgra - "ld1d {z4.d}, p1/z, [%[src1]] \n" // bgrabgra - "incb %[src0] \n" - "incb %[src1] \n" - - "uaddlb z16.h, z0.b, z4.b \n" // brbrbrbr - "uaddlt z17.h, z0.b, z4.b \n" // gagagaga - - "trn1 z0.s, z16.s, z17.s \n" // brgabgra - "trn2 z1.s, z16.s, z17.s \n" // brgabgra - - "add z0.h, p0/m, z0.h, z1.h \n" // brgabrga - - "urshr z0.h, p0/m, z0.h, #2 \n" // brgabrga - - "subs %w[width], %w[width], %w[vl] \n" // VL per loop - - "movi v16.8h, #0 \n" - "movi v20.8h, #0 \n" - - "sdot z16.d, z0.h, z24.h \n" - "sdot z20.d, z0.h, z25.h \n" - - "addhnb z16.b, z16.h, z26.h \n" // U - "addhnb z20.b, z20.h, z26.h \n" // V - - "st1b {z16.d}, p0, [%[dst_u]] \n" // U - "st1b {z20.d}, p0, [%[dst_v]] \n" // V - "incd %[dst_u] \n" - "incd %[dst_v] \n" - "b.gt 3b \n" - - "99: \n" - : [src0] "+r"(src_argb), // %[src0] - [src1] "+r"(src_argb_1), // %[src1] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width), // %[width] - [vl] "=&r"(vl) // %[vl] - : [uvconstants] "r"(uvconstants) + "99: \n" + : [src0] "+r"(src_argb), // %[src0] + [src1] "+r"(src_argb_1), // %[src1] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [uvconstants] "r"(uvconstants), // %[uvconstants] + [vl] "r"(vl), // %[vl] + [vl2] "r"(vl * 2), // %[vl2] + [vl3] "r"(vl * 3) // %[vl3] : "cc", "memory", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", - "p0"); + "z27", "p0", "p1", "p2", "p3", "p4", "p5"); } void ARGBToUVRow_SVE2(const uint8_t* src_argb,