MT2T Unpack fixes

Fix the algorithm for unpacking the lower 2 bits of MT2T pixels.

Bug: b:258474032
Change-Id: Iea1d63f26e3f127a70ead26bc04ea3d939e793e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4337978
Commit-Queue: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Authored by Justin Green on 2023-03-14 10:23:17 -04:00; committed by libyuv LUCI CQ
parent f9b23b9cc0
commit 76468711d5
5 changed files with 117 additions and 107 deletions

@@ -56,10 +56,6 @@ extern "C" {
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSEWX8_NEON
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE4X4_32_NEON
#endif

@@ -410,6 +410,47 @@ void TransposeUVWx8_NEON(const uint8_t* src,
: "r"(&kVTbl4x4TransposeDi) // %8
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
const uint8_t* src1 = src + src_stride;
const uint8_t* src2 = src1 + src_stride;
const uint8_t* src3 = src2 + src_stride;
uint8_t* dst1 = dst + dst_stride;
uint8_t* dst2 = dst1 + dst_stride;
uint8_t* dst3 = dst2 + dst_stride;
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
"vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
"subs %8, %8, #4 \n" // w -= 4
"vst1.8 {q0}, [%4]! \n"
"vst1.8 {q1}, [%5]! \n"
"vst1.8 {q2}, [%6]! \n"
"vst1.8 {q3}, [%7]! \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(src1), // %1
"+r"(src2), // %2
"+r"(src3), // %3
"+r"(dst), // %4
"+r"(dst1), // %5
"+r"(dst2), // %6
"+r"(dst3), // %7
"+r"(width) // %8
: "r"((ptrdiff_t)(src_stride * 4)) // %9
: "memory", "cc", "q0", "q1", "q2", "q3");
}
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
#ifdef __cplusplus
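For reference, here is a scalar sketch of the access pattern that Transpose4x4_32_NEON implements (not part of this CL; the helper name and the reading of `width` as a source-row count are assumptions based on the assembly). Each iteration gathers a 4x4 block of 32-bit ARGB values from four source rows, deinterleaves it so each register holds one source column, and stores those columns as destination rows, so the routine transposes a strip `width` rows tall and 4 pixels wide.

#include <stdint.h>
#include <string.h>

// Hypothetical scalar reference for the NEON routine above: source pixel
// (row r, col c) of a width x 4 ARGB strip lands at destination (row c, col r).
static void Transpose4x4_32_Ref(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride, int width) {
  for (int r = 0; r < width; ++r) {
    for (int c = 0; c < 4; ++c) {
      memcpy(dst + c * dst_stride + r * 4,  // destination: row c, column r
             src + r * src_stride + c * 4,  // source: row r, column c
             4);                            // one 32-bit pixel
    }
  }
}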

@@ -2868,24 +2868,21 @@ void DetileToYUY2_C(const uint8_t* src_y,
// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded
// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the
// block contain all of the lower 2 bits of each pixel packed together, and the
// next 64 bytes represent all the upper 8 bits of the pixel.
// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are
// packed into 1x4 blocks, whereas the upper bits are packed in normal raster
// order.
void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) {
for (size_t i = 0; i < size; i += 80) {
const uint8_t* src_lower_bits = src;
const uint8_t* src_upper_bits = src + 16;
for (int j = 0; j < 16; j++) {
uint8_t lower_bits = src_lower_bits[j];
*dst++ = (lower_bits & 0x03) << 6 | (uint16_t)src_upper_bits[j * 4] << 8 |
(uint16_t)src_upper_bits[j * 4] >> 2;
*dst++ = (lower_bits & 0x0C) << 4 |
(uint16_t)src_upper_bits[j * 4 + 1] << 8 |
(uint16_t)src_upper_bits[j * 4 + 1] >> 2;
*dst++ = (lower_bits & 0x30) << 2 |
(uint16_t)src_upper_bits[j * 4 + 2] << 8 |
(uint16_t)src_upper_bits[j * 4 + 2] >> 2;
*dst++ = (lower_bits & 0xC0) | (uint16_t)src_upper_bits[j * 4 + 3] << 8 |
(uint16_t)src_upper_bits[j * 4 + 3] >> 2;
for (int j = 0; j < 4; j++) {
for (int k = 0; k < 16; k++) {
*dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 |
(uint16_t)*src_upper_bits << 8 |
(uint16_t)*src_upper_bits >> 2;
src_upper_bits++;
}
}
src += 80;
@@ -4547,4 +4544,4 @@ void HalfMergeUVRow_C(const uint8_t* src_u,
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif
#endif
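The per-pixel math of the corrected C routine can be sketched as a standalone helper (not part of the CL; the function name and indexing are assumptions derived from the loop above). The old code treated each of the 16 lower-bit bytes as covering 4 consecutive raster pixels; per the new comment and loop, pixel n of an 80-byte block instead takes its upper byte from block[16 + n] and its 2 low bits from header byte n % 16 at bit position 2 * (n / 16), i.e. each header byte carries the low bits of four output pixels spaced 16 apart.

#include <stdint.h>

// Hypothetical helper: compute P010 output pixel n (0..63) of one 80-byte
// MT2T block, matching the order in which UnpackMT2T_C advances dst.
static uint16_t MT2TPixel(const uint8_t block[80], int n) {
  int row = n / 16;                               // row inside the 16x4 tile
  int col = n % 16;                               // column inside the tile
  uint8_t upper = block[16 + n];                  // upper 8 bits, raster order
  uint8_t lower = (block[col] >> (row * 2)) & 3;  // lower 2 bits, 1x4 packed
  uint16_t v10 = (uint16_t)((upper << 2) | lower);  // full 10-bit sample
  // MSB-align and replicate the top 6 bits into the low 6 bits; this equals
  // the expression lower << 6 | upper << 8 | upper >> 2 used in UnpackMT2T_C.
  return (uint16_t)((v10 << 6) | (v10 >> 4));
}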

@@ -721,57 +721,43 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
#endif
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
const uint8_t* src_lower_bits = src;
const uint8_t* src_upper_bits = src + 16;
asm volatile(
"1: \n"
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Load 32 bytes of upper
// bits.
"vld1.8 {d6}, [%0]! \n" // Load 8 bytes of lower
// bits.
"vshl.u8 d4, d6, #2 \n" // Align lower bits.
"vshl.u8 d2, d6, #4 \n"
"vshl.u8 d0, d6, #6 \n"
"vzip.u8 d0, d1 \n" // Zip lower and upper
// bits together.
"vzip.u8 d2, d3 \n"
"vzip.u8 d4, d5 \n"
"vzip.u8 d6, d7 \n"
"vsri.u16 q0, q0, #10 \n" // Copy upper 6 bits into
// lower 6 bits for better
// accuracy in
// conversions.
"vsri.u16 q1, q1, #10 \n"
"vsri.u16 q2, q2, #10 \n"
"vsri.u16 q3, q3, #10 \n"
"vst4.16 {d0, d2, d4, d6}, [%2]! \n" // Store 32 pixels
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Process last 32 pixels
// in the block
"vld1.8 {d6}, [%0]! \n"
"vshl.u8 d4, d6, #2 \n"
"vshl.u8 d2, d6, #4 \n"
"vshl.u8 d0, d6, #6 \n"
"vzip.u8 d0, d1 \n"
"vzip.u8 d2, d3 \n"
"vzip.u8 d4, d5 \n"
"vzip.u8 d6, d7 \n"
"vsri.u16 q0, q0, #10 \n"
"vsri.u16 q1, q1, #10 \n"
"vsri.u16 q2, q2, #10 \n"
"vsri.u16 q3, q3, #10 \n"
"vst4.16 {d0, d2, d4, d6}, [%2]! \n"
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
"mov %0, %1 \n"
"add %1, %0, #16 \n"
"subs %3, %3, #80 \n"
"bgt 1b \n"
: "+r"(src_lower_bits), // %0
"+r"(src_upper_bits), // %1
"+r"(dst), // %2
"+r"(size) // %3
"1: \n"
"vld1.8 q14, [%0]! \n" // Load lower bits.
"vld1.8 q9, [%0]! \n" // Load upper bits row
// by row.
"vld1.8 q11, [%0]! \n"
"vld1.8 q13, [%0]! \n"
"vld1.8 q15, [%0]! \n"
"vshl.u8 q8, q14, #6 \n" // Shift lower bit data
// appropriately.
"vshl.u8 q10, q14, #4 \n"
"vshl.u8 q12, q14, #2 \n"
"vzip.u8 q8, q9 \n" // Interleave upper and
// lower bits.
"vzip.u8 q10, q11 \n"
"vzip.u8 q12, q13 \n"
"vzip.u8 q14, q15 \n"
"vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits
// into lower 6 bits for
// better accuracy in
// conversions.
"vsri.u16 q9, q9, #10 \n"
"vsri.u16 q10, q10, #10 \n"
"vsri.u16 q11, q11, #10 \n"
"vsri.u16 q12, q12, #10 \n"
"vsri.u16 q13, q13, #10 \n"
"vsri.u16 q14, q14, #10 \n"
"vsri.u16 q15, q15, #10 \n"
"vstmia %1!, {q8-q15} \n" // Store pixel block (64
// pixels).
"subs %2, %2, #80 \n"
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(size) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3");
: "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
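The "copy upper 6 bits into lower 6 bits" step (the vsri.u16 ... #10 instructions) is what the accuracy comment refers to: replicating the top bits maps the 10-bit range onto the full 16-bit range instead of leaving the low 6 bits zero. A minimal standalone illustration (assumed example value, not from the CL):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint16_t v10 = 0x3FF;                                        // max 10-bit sample
  uint16_t msb_aligned = (uint16_t)(v10 << 6);                 // 0xFFC0
  uint16_t replicated =
      (uint16_t)(msb_aligned | (msb_aligned >> 10));           // 0xFFFF
  // With plain MSB alignment, full-scale 0x3FF lands at 0xFFC0; replicating
  // the top 6 bits into the low 6 bits carries it to 0xFFFF (full scale).
  printf("%04x -> %04x\n", msb_aligned, replicated);
  return 0;
}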

@@ -752,49 +752,39 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
// Unpack MT2T into tiled P010 64 pixels at a time. See
// tinyurl.com/mtk-10bit-video-format for format documentation.
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
const uint8_t* src_lower_bits = src;
const uint8_t* src_upper_bits = src + 16;
asm volatile(
"1: \n"
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
"ld1 {v7.8b}, [%0], #8 \n"
"shl v6.8b, v7.8b, #2 \n"
"shl v5.8b, v7.8b, #4 \n"
"shl v4.8b, v7.8b, #6 \n"
"zip1 v0.16b, v4.16b, v0.16b \n"
"zip1 v1.16b, v5.16b, v1.16b \n"
"zip1 v2.16b, v6.16b, v2.16b \n"
"zip1 v3.16b, v7.16b, v3.16b \n"
"sri v0.8h, v0.8h, #10 \n"
"sri v1.8h, v1.8h, #10 \n"
"sri v2.8h, v2.8h, #10 \n"
"sri v3.8h, v3.8h, #10 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
"ld1 {v7.8b}, [%0], #8 \n"
"shl v6.8b, v7.8b, #2 \n"
"shl v5.8b, v7.8b, #4 \n"
"shl v4.8b, v7.8b, #6 \n"
"zip1 v0.16b, v4.16b, v0.16b \n"
"zip1 v1.16b, v5.16b, v1.16b \n"
"zip1 v2.16b, v6.16b, v2.16b \n"
"zip1 v3.16b, v7.16b, v3.16b \n"
"sri v0.8h, v0.8h, #10 \n"
"sri v1.8h, v1.8h, #10 \n"
"sri v2.8h, v2.8h, #10 \n"
"sri v3.8h, v3.8h, #10 \n"
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
"mov %0, %1 \n"
"add %1, %0, #16 \n"
"subs %3, %3, #80 \n"
"b.gt 1b \n"
: "+r"(src_lower_bits), // %0
"+r"(src_upper_bits), // %1
"+r"(dst), // %2
"+r"(size) // %3
"1: \n"
"ld1 {v7.16b}, [%0], #16 \n"
"ld1 {v0.16b-v3.16b}, [%0], #64 \n"
"shl v4.16b, v7.16b, #6 \n"
"shl v5.16b, v7.16b, #4 \n"
"shl v6.16b, v7.16b, #2 \n"
"subs %2, %2, #80 \n"
"zip1 v16.16b, v4.16b, v0.16b \n"
"zip1 v18.16b, v5.16b, v1.16b \n"
"zip1 v20.16b, v6.16b, v2.16b \n"
"zip1 v22.16b, v7.16b, v3.16b \n"
"zip2 v17.16b, v4.16b, v0.16b \n"
"zip2 v19.16b, v5.16b, v1.16b \n"
"zip2 v21.16b, v6.16b, v2.16b \n"
"zip2 v23.16b, v7.16b, v3.16b \n"
"sri v16.8h, v16.8h, #10 \n"
"sri v17.8h, v17.8h, #10 \n"
"sri v18.8h, v18.8h, #10 \n"
"sri v19.8h, v19.8h, #10 \n"
"st1 {v16.8h-v19.8h}, [%1], #64 \n"
"sri v20.8h, v20.8h, #10 \n"
"sri v21.8h, v21.8h, #10 \n"
"sri v22.8h, v22.8h, #10 \n"
"sri v23.8h, v23.8h, #10 \n"
"st1 {v20.8h-v23.8h}, [%1], #64 \n"
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(size) // %2
:
: "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v8", "v9", "v10", "v11", "v12");
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
#if LIBYUV_USE_ST2
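A minimal consistency check that could be built around these kernels (a sketch, not part of this CL): unpack the same 80-byte block with the C reference and the NEON path and compare. It assumes both row functions are exposed through libyuv's row.h and that a HAS_UNPACKMT2T_NEON guard exists; those names are assumptions, not taken from this diff.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "libyuv/row.h"

int main(void) {
  uint8_t src[80];  // one MT2T block: 16 low-bit bytes + 64 upper-bit bytes
  uint16_t dst_c[64];
  uint16_t dst_neon[64];
  for (size_t i = 0; i < sizeof(src); ++i) {
    src[i] = (uint8_t)(rand() & 0xFF);
  }
  UnpackMT2T_C(src, dst_c, sizeof(src));
#if defined(HAS_UNPACKMT2T_NEON)  // assumed guard name
  UnpackMT2T_NEON(src, dst_neon, sizeof(src));
  assert(memcmp(dst_c, dst_neon, sizeof(dst_c)) == 0);
#else
  (void)dst_neon;
#endif
  return 0;
}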