MT2T Unpack fixes

Fix the algorithm for unpacking the lower 2 bits of MT2T pixels.

Bug: b:258474032
Change-Id: Iea1d63f26e3f127a70ead26bc04ea3d939e793e3
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4337978
Commit-Queue: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: Justin Green <greenjustin@google.com>
Date: 2023-03-14 10:23:17 -04:00
Committer: libyuv LUCI CQ
Parent: f9b23b9cc0
Commit: 76468711d5
5 changed files with 117 additions and 107 deletions

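For reference, the pixel-to-bit mapping that the corrected code assumes can be written as a standalone expression. This is a sketch derived from the new UnpackMT2T_C later in this change; Mt2tPixel10 is a hypothetical helper name, not part of libyuv.

#include <stdint.h>

// Within one 80-byte MT2T block, pixel p (0..63) takes its low 2 bits from
// lower-bit byte p % 16 at bit offset 2 * (p / 16), and its high 8 bits from
// upper-bit byte 16 + p (raster order).
static inline uint16_t Mt2tPixel10(const uint8_t block[80], int p) {
  uint8_t lower2 = (uint8_t)((block[p % 16] >> (2 * (p / 16))) & 0x3);
  uint8_t upper8 = block[16 + p];
  return (uint16_t)((upper8 << 2) | lower2);  // plain 10-bit value
}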
include/libyuv/rotate_row.h

@@ -56,10 +56,6 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_TRANSPOSEWX8_NEON
 #define HAS_TRANSPOSEUVWX8_NEON
-#endif
-
-#if !defined(LIBYUV_DISABLE_NEON) && \
-    (defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_TRANSPOSE4X4_32_NEON
 #endif

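The guard change above makes HAS_TRANSPOSE4X4_32_NEON visible to 32-bit ARM builds as well, since this change adds a 32-bit NEON implementation below. A sketch of the usual libyuv-style runtime dispatch such a define feeds; this is an assumed pattern with a hypothetical chooser function, not code from this diff:

#include "libyuv/cpu_id.h"     // TestCpuFlag, kCpuHasNEON
#include "libyuv/rotate_row.h" // Transpose4x4_32_C / _NEON declarations (assumption)

typedef void (*Transpose4x4_32Fn)(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride, int width);

// Pick the fastest available 4x4 32-bit transpose at runtime.
static Transpose4x4_32Fn ChooseTranspose4x4_32(void) {
  Transpose4x4_32Fn fn = Transpose4x4_32_C;
#if defined(HAS_TRANSPOSE4X4_32_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    fn = Transpose4x4_32_NEON;
  }
#endif
  return fn;
}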
source/rotate_neon.cc

@@ -410,6 +410,47 @@ void TransposeUVWx8_NEON(const uint8_t* src,
       : "r"(&kVTbl4x4TransposeDi)  // %8
       : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  asm volatile(
+      // Main loop transpose 4x4.  Read a column, write a row.
+      "1:          \n"
+      "vld4.32     {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
+      "vld4.32     {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
+      "vld4.32     {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
+      "vld4.32     {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
+      "subs        %8, %8, #4                    \n"  // w -= 4
+      "vst1.8      {q0}, [%4]!                   \n"
+      "vst1.8      {q1}, [%5]!                   \n"
+      "vst1.8      {q2}, [%6]!                   \n"
+      "vst1.8      {q3}, [%7]!                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(dst1),  // %5
+        "+r"(dst2),  // %6
+        "+r"(dst3),  // %7
+        "+r"(width)  // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
+      : "memory", "cc", "q0", "q1", "q2", "q3");
+}
 
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
 #ifdef __cplusplus

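For readers not fluent in NEON, here is a scalar sketch of what the routine above computes, assuming strides are in bytes and width is a multiple of 4: a width x 4 block of 32-bit values is transposed into a 4 x width block. Transpose4x4_32_Ref is a hypothetical reference helper, not part of this change.

#include <stdint.h>

// Scalar equivalent (sketch): dst[r][c] = src[c][r] for r in 0..3 and
// c in 0..width-1, with 32-bit (e.g. ARGB) elements.
static void Transpose4x4_32_Ref(const uint8_t* src,
                                int src_stride,
                                uint8_t* dst,
                                int dst_stride,
                                int width) {
  for (int c = 0; c < width; ++c) {
    const uint32_t* s = (const uint32_t*)(src + c * src_stride);
    for (int r = 0; r < 4; ++r) {
      uint32_t* d = (uint32_t*)(dst + r * dst_stride);
      d[c] = s[r];
    }
  }
}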
source/row_common.cc

@@ -2868,24 +2868,21 @@ void DetileToYUY2_C(const uint8_t* src_y,
 // Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded
 // in 80 byte blocks representing 64 pixels each. The first 16 bytes of the
 // block contain all of the lower 2 bits of each pixel packed together, and the
-// next 64 bytes represent all the upper 8 bits of the pixel.
+// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are
+// packed into 1x4 blocks, whereas the upper bits are packed in normal raster
+// order.
 void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) {
   for (size_t i = 0; i < size; i += 80) {
     const uint8_t* src_lower_bits = src;
     const uint8_t* src_upper_bits = src + 16;
 
-    for (int j = 0; j < 16; j++) {
-      uint8_t lower_bits = src_lower_bits[j];
-      *dst++ = (lower_bits & 0x03) << 6 | (uint16_t)src_upper_bits[j * 4] << 8 |
-               (uint16_t)src_upper_bits[j * 4] >> 2;
-      *dst++ = (lower_bits & 0x0C) << 4 |
-               (uint16_t)src_upper_bits[j * 4 + 1] << 8 |
-               (uint16_t)src_upper_bits[j * 4 + 1] >> 2;
-      *dst++ = (lower_bits & 0x30) << 2 |
-               (uint16_t)src_upper_bits[j * 4 + 2] << 8 |
-               (uint16_t)src_upper_bits[j * 4 + 2] >> 2;
-      *dst++ = (lower_bits & 0xC0) | (uint16_t)src_upper_bits[j * 4 + 3] << 8 |
-               (uint16_t)src_upper_bits[j * 4 + 3] >> 2;
+    for (int j = 0; j < 4; j++) {
+      for (int k = 0; k < 16; k++) {
+        *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 |
+                 (uint16_t)*src_upper_bits << 8 |
+                 (uint16_t)*src_upper_bits >> 2;
+        src_upper_bits++;
+      }
     }
 
     src += 80;

@@ -4547,4 +4544,4 @@ void HalfMergeUVRow_C(const uint8_t* src_u,
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif

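A quick way to see the corrected layout in action is to round-trip one block through the C routine. This is a hypothetical self-test, not part of the change; it assumes UnpackMT2T_C is declared via libyuv's row.h.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#include "libyuv/row.h"  // declares UnpackMT2T_C (assumption)

int main(void) {
  uint8_t block[80] = {0};
  uint16_t out[64];
  for (int p = 0; p < 64; ++p) {
    uint16_t v = (uint16_t)((p * 21) & 0x3FF);  // arbitrary 10-bit values
    block[16 + p] = (uint8_t)(v >> 2);          // upper 8 bits, raster order
    block[p % 16] |= (uint8_t)((v & 0x3) << (2 * (p / 16)));  // packed low 2 bits
  }
  UnpackMT2T_C(block, out, sizeof(block));
  for (int p = 0; p < 64; ++p) {
    uint16_t v = (uint16_t)((p * 21) & 0x3FF);
    assert((uint16_t)(out[p] >> 6) == v);       // 10-bit value sits in bits [15:6]
  }
  return 0;
}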
source/row_neon.cc

@@ -721,57 +721,43 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 #endif
 
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-  const uint8_t* src_lower_bits = src;
-  const uint8_t* src_upper_bits = src + 16;
   asm volatile(
       "1:          \n"
-      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // Load 32 bytes of upper
-                                                      // bits.
-      "vld1.8      {d6}, [%0]!                   \n"  // Load 8 bytes of lower
-                                                      // bits.
-      "vshl.u8     d4, d6, #2                    \n"  // Align lower bits.
-      "vshl.u8     d2, d6, #4                    \n"
-      "vshl.u8     d0, d6, #6                    \n"
-      "vzip.u8     d0, d1                        \n"  // Zip lower and upper
-                                                      // bits together.
-      "vzip.u8     d2, d3                        \n"
-      "vzip.u8     d4, d5                        \n"
-      "vzip.u8     d6, d7                        \n"
-      "vsri.u16    q0, q0, #10                   \n"  // Copy upper 6 bits into
-                                                      // lower 6 bits for better
-                                                      // accuracy in
-                                                      // conversions.
-      "vsri.u16    q1, q1, #10                   \n"
-      "vsri.u16    q2, q2, #10                   \n"
-      "vsri.u16    q3, q3, #10                   \n"
-      "vst4.16     {d0, d2, d4, d6}, [%2]!       \n"  // Store 32 pixels
-      "vst4.16     {d1, d3, d5, d7}, [%2]!       \n"
-      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // Process last 32 pixels
-                                                      // in the block
-      "vld1.8      {d6}, [%0]!                   \n"
-      "vshl.u8     d4, d6, #2                    \n"
-      "vshl.u8     d2, d6, #4                    \n"
-      "vshl.u8     d0, d6, #6                    \n"
-      "vzip.u8     d0, d1                        \n"
-      "vzip.u8     d2, d3                        \n"
-      "vzip.u8     d4, d5                        \n"
-      "vzip.u8     d6, d7                        \n"
-      "vsri.u16    q0, q0, #10                   \n"
-      "vsri.u16    q1, q1, #10                   \n"
-      "vsri.u16    q2, q2, #10                   \n"
-      "vsri.u16    q3, q3, #10                   \n"
-      "vst4.16     {d0, d2, d4, d6}, [%2]!       \n"
-      "vst4.16     {d1, d3, d5, d7}, [%2]!       \n"
-      "mov         %0, %1                        \n"
-      "add         %1, %0, #16                   \n"
-      "subs        %3, %3, #80                   \n"
-      "bgt         1b                            \n"
-      : "+r"(src_lower_bits),  // %0
-        "+r"(src_upper_bits),  // %1
-        "+r"(dst),             // %2
-        "+r"(size)             // %3
+      "vld1.8      q14, [%0]!                    \n"  // Load lower bits.
+      "vld1.8      q9, [%0]!                     \n"  // Load upper bits row
+                                                      // by row.
+      "vld1.8      q11, [%0]!                    \n"
+      "vld1.8      q13, [%0]!                    \n"
+      "vld1.8      q15, [%0]!                    \n"
+      "vshl.u8     q8, q14, #6                   \n"  // Shift lower bit data
+                                                      // appropriately.
+      "vshl.u8     q10, q14, #4                  \n"
+      "vshl.u8     q12, q14, #2                  \n"
+      "vzip.u8     q8, q9                        \n"  // Interleave upper and
+                                                      // lower bits.
+      "vzip.u8     q10, q11                      \n"
+      "vzip.u8     q12, q13                      \n"
+      "vzip.u8     q14, q15                      \n"
+      "vsri.u16    q8, q8, #10                   \n"  // Copy upper 6 bits
+                                                      // into lower 6 bits for
+                                                      // better accuracy in
+                                                      // conversions.
+      "vsri.u16    q9, q9, #10                   \n"
+      "vsri.u16    q10, q10, #10                 \n"
+      "vsri.u16    q11, q11, #10                 \n"
+      "vsri.u16    q12, q12, #10                 \n"
+      "vsri.u16    q13, q13, #10                 \n"
+      "vsri.u16    q14, q14, #10                 \n"
+      "vsri.u16    q15, q15, #10                 \n"
+      "vstmia      %1!, {q8-q15}                 \n"  // Store pixel block (64
+                                                      // pixels).
+      "subs        %2, %2, #80                   \n"
+      "bgt         1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(size)   // %2
       :
-      : "cc", "memory", "q0", "q1", "q2", "q3");
+      : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.

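The vsri.u16 step above replicates the top 6 bits of each 10-bit value into the low 6 bits of the 16-bit output ("for better accuracy in conversions"), which approximates scaling by 65535/1023 so full-scale 10-bit maps to full-scale 16-bit. A small demonstration with a hypothetical helper, not part of this change:

#include <assert.h>
#include <stdint.h>

// Bit replication performed by the vsri.u16/sri instructions: the 10-bit
// value sits in bits [15:6] and its top 6 bits are copied into bits [5:0].
static uint16_t Scale10To16(uint16_t v10) {
  return (uint16_t)((v10 << 6) | (v10 >> 4));
}

int main(void) {
  assert(Scale10To16(0x000) == 0x0000);  // black stays black
  assert(Scale10To16(0x3FF) == 0xFFFF);  // full scale maps to full scale
  return 0;
}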
source/row_neon64.cc

@@ -752,49 +752,39 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 // Unpack MT2T into tiled P010 64 pixels at a time. See
 // tinyurl.com/mtk-10bit-video-format for format documentation.
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-  const uint8_t* src_lower_bits = src;
-  const uint8_t* src_upper_bits = src + 16;
   asm volatile(
       "1:          \n"
-      "ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
-      "ld1         {v7.8b}, [%0], #8             \n"
-      "shl         v6.8b, v7.8b, #2              \n"
-      "shl         v5.8b, v7.8b, #4              \n"
-      "shl         v4.8b, v7.8b, #6              \n"
-      "zip1        v0.16b, v4.16b, v0.16b        \n"
-      "zip1        v1.16b, v5.16b, v1.16b        \n"
-      "zip1        v2.16b, v6.16b, v2.16b        \n"
-      "zip1        v3.16b, v7.16b, v3.16b        \n"
-      "sri         v0.8h, v0.8h, #10             \n"
-      "sri         v1.8h, v1.8h, #10             \n"
-      "sri         v2.8h, v2.8h, #10             \n"
-      "sri         v3.8h, v3.8h, #10             \n"
-      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
-      "ld4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
-      "ld1         {v7.8b}, [%0], #8             \n"
-      "shl         v6.8b, v7.8b, #2              \n"
-      "shl         v5.8b, v7.8b, #4              \n"
-      "shl         v4.8b, v7.8b, #6              \n"
-      "zip1        v0.16b, v4.16b, v0.16b        \n"
-      "zip1        v1.16b, v5.16b, v1.16b        \n"
-      "zip1        v2.16b, v6.16b, v2.16b        \n"
-      "zip1        v3.16b, v7.16b, v3.16b        \n"
-      "sri         v0.8h, v0.8h, #10             \n"
-      "sri         v1.8h, v1.8h, #10             \n"
-      "sri         v2.8h, v2.8h, #10             \n"
-      "sri         v3.8h, v3.8h, #10             \n"
-      "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
-      "mov         %0, %1                        \n"
-      "add         %1, %0, #16                   \n"
-      "subs        %3, %3, #80                   \n"
-      "b.gt        1b                            \n"
-      : "+r"(src_lower_bits),  // %0
-        "+r"(src_upper_bits),  // %1
-        "+r"(dst),             // %2
-        "+r"(size)             // %3
+      "ld1         {v7.16b}, [%0], #16           \n"
+      "ld1         {v0.16b-v3.16b}, [%0], #64    \n"
+      "shl         v4.16b, v7.16b, #6            \n"
+      "shl         v5.16b, v7.16b, #4            \n"
+      "shl         v6.16b, v7.16b, #2            \n"
+      "subs        %2, %2, #80                   \n"
+      "zip1        v16.16b, v4.16b, v0.16b       \n"
+      "zip1        v18.16b, v5.16b, v1.16b       \n"
+      "zip1        v20.16b, v6.16b, v2.16b       \n"
+      "zip1        v22.16b, v7.16b, v3.16b       \n"
+      "zip2        v17.16b, v4.16b, v0.16b       \n"
+      "zip2        v19.16b, v5.16b, v1.16b       \n"
+      "zip2        v21.16b, v6.16b, v2.16b       \n"
+      "zip2        v23.16b, v7.16b, v3.16b       \n"
+      "sri         v16.8h, v16.8h, #10           \n"
+      "sri         v17.8h, v17.8h, #10           \n"
+      "sri         v18.8h, v18.8h, #10           \n"
+      "sri         v19.8h, v19.8h, #10           \n"
+      "st1         {v16.8h-v19.8h}, [%1], #64    \n"
+      "sri         v20.8h, v20.8h, #10           \n"
+      "sri         v21.8h, v21.8h, #10           \n"
+      "sri         v22.8h, v22.8h, #10           \n"
+      "sri         v23.8h, v23.8h, #10           \n"
+      "st1         {v20.8h-v23.8h}, [%1], #64    \n"
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(size)   // %2
       :
       : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-        "v8", "v9", "v10", "v11", "v12");
+        "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
 
 #if LIBYUV_USE_ST2