mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
MT2T Unpack fixes
Fix the algorithm for unpacking the lower 2 bits of MT2T pixels. Bug: b:258474032 Change-Id: Iea1d63f26e3f127a70ead26bc04ea3d939e793e3 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4337978 Commit-Queue: Justin Green <greenjustin@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
f9b23b9cc0
commit
76468711d5
@ -56,10 +56,6 @@ extern "C" {
|
||||
(defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_TRANSPOSEWX8_NEON
|
||||
#define HAS_TRANSPOSEUVWX8_NEON
|
||||
#endif
|
||||
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && \
|
||||
(defined(LIBYUV_NEON) || defined(__aarch64__))
|
||||
#define HAS_TRANSPOSE4X4_32_NEON
|
||||
#endif
|
||||
|
||||
|
||||
@ -410,6 +410,47 @@ void TransposeUVWx8_NEON(const uint8_t* src,
|
||||
: "r"(&kVTbl4x4TransposeDi) // %8
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
|
||||
}
|
||||
|
||||
|
||||
// Transpose 32 bit values (ARGB)
|
||||
void Transpose4x4_32_NEON(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst,
|
||||
int dst_stride,
|
||||
int width) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
const uint8_t* src2 = src1 + src_stride;
|
||||
const uint8_t* src3 = src2 + src_stride;
|
||||
uint8_t* dst1 = dst + dst_stride;
|
||||
uint8_t* dst2 = dst1 + dst_stride;
|
||||
uint8_t* dst3 = dst2 + dst_stride;
|
||||
asm volatile(
|
||||
// Main loop transpose 4x4. Read a column, write a row.
|
||||
"1: \n"
|
||||
"vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"
|
||||
"vld4.32 {d0[1], d2[1], d4[1], d6[1]}, [%1], %9 \n"
|
||||
"vld4.32 {d1[0], d3[0], d5[0], d7[0]}, [%2], %9 \n"
|
||||
"vld4.32 {d1[1], d3[1], d5[1], d7[1]}, [%3], %9 \n"
|
||||
"subs %8, %8, #4 \n" // w -= 4
|
||||
"vst1.8 {q0}, [%4]! \n"
|
||||
"vst1.8 {q1}, [%5]! \n"
|
||||
"vst1.8 {q2}, [%6]! \n"
|
||||
"vst1.8 {q3}, [%7]! \n"
|
||||
"bgt 1b \n"
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src1), // %1
|
||||
"+r"(src2), // %2
|
||||
"+r"(src3), // %3
|
||||
"+r"(dst), // %4
|
||||
"+r"(dst1), // %5
|
||||
"+r"(dst2), // %6
|
||||
"+r"(dst3), // %7
|
||||
"+r"(width) // %8
|
||||
: "r"((ptrdiff_t)(src_stride * 4)) // %9
|
||||
: "memory", "cc", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
#endif // defined(__ARM_NEON__) && !defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -2868,24 +2868,21 @@ void DetileToYUY2_C(const uint8_t* src_y,
|
||||
// Unpack MT2T into tiled P010 64 pixels at a time. MT2T's bitstream is encoded
// in 80 byte blocks representing 64 pixels each. The first 16 bytes of the
// block contain all of the lower 2 bits of each pixel packed together, and the
// next 64 bytes represent all the upper 8 bits of the pixel. The lower bits are
// packed into 1x4 blocks, whereas the upper bits are packed in normal raster
// order.
void UnpackMT2T_C(const uint8_t* src, uint16_t* dst, size_t size) {
  // size is the input byte count and is assumed to be a multiple of 80
  // (one whole block); each block expands to 64 output uint16_t pixels.
  for (size_t i = 0; i < size; i += 80) {
    const uint8_t* src_lower_bits = src;
    const uint8_t* src_upper_bits = src + 16;

    // Pixel p = j * 16 + k takes its 2 low source bits from bit pair j of
    // src_lower_bits[k] and its 8 high source bits from src_upper_bits[p].
    for (int j = 0; j < 4; j++) {
      for (int k = 0; k < 16; k++) {
        // Place the 10-bit value in the top of the 16-bit lane, then
        // replicate the top 6 bits into the low 6 bits (upper >> 2) for
        // better accuracy in subsequent conversions.
        *dst++ = ((src_lower_bits[k] >> (j * 2)) & 0x3) << 6 |
                 (uint16_t)*src_upper_bits << 8 |
                 (uint16_t)*src_upper_bits >> 2;
        src_upper_bits++;
      }
    }

    src += 80;
  }
}
|
||||
@ -4547,4 +4544,4 @@ void HalfMergeUVRow_C(const uint8_t* src_u,
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -721,57 +721,43 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
#endif
|
||||
|
||||
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
const uint8_t* src_lower_bits = src;
|
||||
const uint8_t* src_upper_bits = src + 16;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Load 32 bytes of upper
|
||||
// bits.
|
||||
"vld1.8 {d6}, [%0]! \n" // Load 8 bytes of lower
|
||||
// bits.
|
||||
"vshl.u8 d4, d6, #2 \n" // Align lower bits.
|
||||
"vshl.u8 d2, d6, #4 \n"
|
||||
"vshl.u8 d0, d6, #6 \n"
|
||||
"vzip.u8 d0, d1 \n" // Zip lower and upper
|
||||
// bits together.
|
||||
"vzip.u8 d2, d3 \n"
|
||||
"vzip.u8 d4, d5 \n"
|
||||
"vzip.u8 d6, d7 \n"
|
||||
"vsri.u16 q0, q0, #10 \n" // Copy upper 6 bits into
|
||||
// lower 6 bits for better
|
||||
// accuracy in
|
||||
// conversions.
|
||||
"vsri.u16 q1, q1, #10 \n"
|
||||
"vsri.u16 q2, q2, #10 \n"
|
||||
"vsri.u16 q3, q3, #10 \n"
|
||||
"vst4.16 {d0, d2, d4, d6}, [%2]! \n" // Store 32 pixels
|
||||
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
|
||||
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // Process last 32 pixels
|
||||
// in the block
|
||||
"vld1.8 {d6}, [%0]! \n"
|
||||
"vshl.u8 d4, d6, #2 \n"
|
||||
"vshl.u8 d2, d6, #4 \n"
|
||||
"vshl.u8 d0, d6, #6 \n"
|
||||
"vzip.u8 d0, d1 \n"
|
||||
"vzip.u8 d2, d3 \n"
|
||||
"vzip.u8 d4, d5 \n"
|
||||
"vzip.u8 d6, d7 \n"
|
||||
"vsri.u16 q0, q0, #10 \n"
|
||||
"vsri.u16 q1, q1, #10 \n"
|
||||
"vsri.u16 q2, q2, #10 \n"
|
||||
"vsri.u16 q3, q3, #10 \n"
|
||||
"vst4.16 {d0, d2, d4, d6}, [%2]! \n"
|
||||
"vst4.16 {d1, d3, d5, d7}, [%2]! \n"
|
||||
"mov %0, %1 \n"
|
||||
"add %1, %0, #16 \n"
|
||||
"subs %3, %3, #80 \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_lower_bits), // %0
|
||||
"+r"(src_upper_bits), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(size) // %3
|
||||
"1: \n"
|
||||
"vld1.8 q14, [%0]! \n" // Load lower bits.
|
||||
"vld1.8 q9, [%0]! \n" // Load upper bits row
|
||||
// by row.
|
||||
"vld1.8 q11, [%0]! \n"
|
||||
"vld1.8 q13, [%0]! \n"
|
||||
"vld1.8 q15, [%0]! \n"
|
||||
"vshl.u8 q8, q14, #6 \n" // Shift lower bit data
|
||||
// appropriately.
|
||||
"vshl.u8 q10, q14, #4 \n"
|
||||
"vshl.u8 q12, q14, #2 \n"
|
||||
"vzip.u8 q8, q9 \n" // Interleave upper and
|
||||
// lower bits.
|
||||
"vzip.u8 q10, q11 \n"
|
||||
"vzip.u8 q12, q13 \n"
|
||||
"vzip.u8 q14, q15 \n"
|
||||
"vsri.u16 q8, q8, #10 \n" // Copy upper 6 bits
|
||||
// into lower 6 bits for
|
||||
// better accuracy in
|
||||
// conversions.
|
||||
"vsri.u16 q9, q9, #10 \n"
|
||||
"vsri.u16 q10, q10, #10 \n"
|
||||
"vsri.u16 q11, q11, #10 \n"
|
||||
"vsri.u16 q12, q12, #10 \n"
|
||||
"vsri.u16 q13, q13, #10 \n"
|
||||
"vsri.u16 q14, q14, #10 \n"
|
||||
"vsri.u16 q15, q15, #10 \n"
|
||||
"vstmia %1!, {q8-q15} \n" // Store pixel block (64
|
||||
// pixels).
|
||||
"subs %2, %2, #80 \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(size) // %2
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
||||
: "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
||||
|
||||
@ -752,49 +752,39 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
// Unpack MT2T into tiled P010 64 pixels at a time. See
|
||||
// tinyurl.com/mtk-10bit-video-format for format documentation.
|
||||
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
|
||||
const uint8_t* src_lower_bits = src;
|
||||
const uint8_t* src_upper_bits = src + 16;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
|
||||
"ld1 {v7.8b}, [%0], #8 \n"
|
||||
"shl v6.8b, v7.8b, #2 \n"
|
||||
"shl v5.8b, v7.8b, #4 \n"
|
||||
"shl v4.8b, v7.8b, #6 \n"
|
||||
"zip1 v0.16b, v4.16b, v0.16b \n"
|
||||
"zip1 v1.16b, v5.16b, v1.16b \n"
|
||||
"zip1 v2.16b, v6.16b, v2.16b \n"
|
||||
"zip1 v3.16b, v7.16b, v3.16b \n"
|
||||
"sri v0.8h, v0.8h, #10 \n"
|
||||
"sri v1.8h, v1.8h, #10 \n"
|
||||
"sri v2.8h, v2.8h, #10 \n"
|
||||
"sri v3.8h, v3.8h, #10 \n"
|
||||
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
|
||||
"ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%1], #32 \n"
|
||||
"ld1 {v7.8b}, [%0], #8 \n"
|
||||
"shl v6.8b, v7.8b, #2 \n"
|
||||
"shl v5.8b, v7.8b, #4 \n"
|
||||
"shl v4.8b, v7.8b, #6 \n"
|
||||
"zip1 v0.16b, v4.16b, v0.16b \n"
|
||||
"zip1 v1.16b, v5.16b, v1.16b \n"
|
||||
"zip1 v2.16b, v6.16b, v2.16b \n"
|
||||
"zip1 v3.16b, v7.16b, v3.16b \n"
|
||||
"sri v0.8h, v0.8h, #10 \n"
|
||||
"sri v1.8h, v1.8h, #10 \n"
|
||||
"sri v2.8h, v2.8h, #10 \n"
|
||||
"sri v3.8h, v3.8h, #10 \n"
|
||||
"st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%2], #64 \n"
|
||||
"mov %0, %1 \n"
|
||||
"add %1, %0, #16 \n"
|
||||
"subs %3, %3, #80 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_lower_bits), // %0
|
||||
"+r"(src_upper_bits), // %1
|
||||
"+r"(dst), // %2
|
||||
"+r"(size) // %3
|
||||
"1: \n"
|
||||
"ld1 {v7.16b}, [%0], #16 \n"
|
||||
"ld1 {v0.16b-v3.16b}, [%0], #64 \n"
|
||||
"shl v4.16b, v7.16b, #6 \n"
|
||||
"shl v5.16b, v7.16b, #4 \n"
|
||||
"shl v6.16b, v7.16b, #2 \n"
|
||||
"subs %2, %2, #80 \n"
|
||||
"zip1 v16.16b, v4.16b, v0.16b \n"
|
||||
"zip1 v18.16b, v5.16b, v1.16b \n"
|
||||
"zip1 v20.16b, v6.16b, v2.16b \n"
|
||||
"zip1 v22.16b, v7.16b, v3.16b \n"
|
||||
"zip2 v17.16b, v4.16b, v0.16b \n"
|
||||
"zip2 v19.16b, v5.16b, v1.16b \n"
|
||||
"zip2 v21.16b, v6.16b, v2.16b \n"
|
||||
"zip2 v23.16b, v7.16b, v3.16b \n"
|
||||
"sri v16.8h, v16.8h, #10 \n"
|
||||
"sri v17.8h, v17.8h, #10 \n"
|
||||
"sri v18.8h, v18.8h, #10 \n"
|
||||
"sri v19.8h, v19.8h, #10 \n"
|
||||
"st1 {v16.8h-v19.8h}, [%1], #64 \n"
|
||||
"sri v20.8h, v20.8h, #10 \n"
|
||||
"sri v21.8h, v21.8h, #10 \n"
|
||||
"sri v22.8h, v22.8h, #10 \n"
|
||||
"sri v23.8h, v23.8h, #10 \n"
|
||||
"st1 {v20.8h-v23.8h}, [%1], #64 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(size) // %2
|
||||
:
|
||||
: "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v8", "v9", "v10", "v11", "v12");
|
||||
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
|
||||
}
|
||||
|
||||
#if LIBYUV_USE_ST2
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user