mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Add Detile_16 planar function for 10 bit MT2T format
- Neon and SSE2 - Any for odd widths Pixel 2 little core AArch32 build C TestDetilePlane_16 (1275 ms) TestDetilePlane (1203 ms) Neon TestDetilePlane_16 (693 ms) TestDetilePlane (660 ms) Bug: b/258474032 Change-Id: Idbd09c5e9324e4deef5f1d54090d4b63cc7db812 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4031848 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
6f21862f1b
commit
2d2cee418a
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1848
|
Version: 1849
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -85,13 +85,23 @@ void SetPlane(uint8_t* dst_y,
|
|||||||
|
|
||||||
// Convert a plane of tiles of 16 x H to linear.
|
// Convert a plane of tiles of 16 x H to linear.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
void DetilePlane(const uint8_t* src_y,
|
int DetilePlane(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int dst_stride_y,
|
int dst_stride_y,
|
||||||
int width,
|
int width,
|
||||||
int height,
|
int height,
|
||||||
int tile_height);
|
int tile_height);
|
||||||
|
|
||||||
|
// Convert a plane of 16 bit tiles of 16 x H to linear.
// src_stride_y and dst_stride_y are added to uint16_t pointers, so they count
// 16 bit elements rather than bytes.
// A negative height inverts the image.
// Returns 0 on success, or -1 if a pointer is NULL, width <= 0, height == 0
// or tile_height is not a power of two.
|
||||||
|
LIBYUV_API
|
||||||
|
int DetilePlane_16(const uint16_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int tile_height);
|
||||||
|
|
||||||
// Convert a UV plane of tiles of 16 x H into linear U and V planes.
|
// Convert a UV plane of tiles of 16 x H into linear U and V planes.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
@ -106,6 +116,7 @@ void DetileSplitUVPlane(const uint8_t* src_uv,
|
|||||||
int tile_height);
|
int tile_height);
|
||||||
|
|
||||||
// Convert a Y and UV plane of tiles into interlaced YUY2.
|
// Convert a Y and UV plane of tiles into interlaced YUY2.
|
||||||
|
LIBYUV_API
|
||||||
void DetileToYUY2(const uint8_t* src_y,
|
void DetileToYUY2(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
const uint8_t* src_uv,
|
const uint8_t* src_uv,
|
||||||
@ -382,6 +393,7 @@ int I210Copy(const uint16_t* src_y,
|
|||||||
int height);
|
int height);
|
||||||
|
|
||||||
// Copy NV12. Supports inverting.
|
// Copy NV12. Supports inverting.
|
||||||
|
LIBYUV_API
|
||||||
int NV12Copy(const uint8_t* src_y,
|
int NV12Copy(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
const uint8_t* src_uv,
|
const uint8_t* src_uv,
|
||||||
@ -394,6 +406,7 @@ int NV12Copy(const uint8_t* src_y,
|
|||||||
int height);
|
int height);
|
||||||
|
|
||||||
// Copy NV21. Supports inverting.
|
// Copy NV21. Supports inverting.
|
||||||
|
LIBYUV_API
|
||||||
int NV21Copy(const uint8_t* src_y,
|
int NV21Copy(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
const uint8_t* src_vu,
|
const uint8_t* src_vu,
|
||||||
|
|||||||
@ -290,6 +290,7 @@ extern "C" {
|
|||||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||||
#define HAS_CONVERT8TO16ROW_SSE2
|
#define HAS_CONVERT8TO16ROW_SSE2
|
||||||
#define HAS_DETILEROW_SSE2
|
#define HAS_DETILEROW_SSE2
|
||||||
|
#define HAS_DETILEROW_16_SSE2
|
||||||
#define HAS_DETILESPLITUVROW_SSSE3
|
#define HAS_DETILESPLITUVROW_SSSE3
|
||||||
#define HAS_DETILETOYUY2_SSE2
|
#define HAS_DETILETOYUY2_SSE2
|
||||||
#define HAS_HALFMERGEUVROW_SSSE3
|
#define HAS_HALFMERGEUVROW_SSSE3
|
||||||
@ -449,6 +450,7 @@ extern "C" {
|
|||||||
#define HAS_BYTETOFLOATROW_NEON
|
#define HAS_BYTETOFLOATROW_NEON
|
||||||
#define HAS_CONVERT16TO8ROW_NEON
|
#define HAS_CONVERT16TO8ROW_NEON
|
||||||
#define HAS_COPYROW_NEON
|
#define HAS_COPYROW_NEON
|
||||||
|
#define HAS_DETILEROW_16_NEON
|
||||||
#define HAS_DETILEROW_NEON
|
#define HAS_DETILEROW_NEON
|
||||||
#define HAS_DETILESPLITUVROW_NEON
|
#define HAS_DETILESPLITUVROW_NEON
|
||||||
#define HAS_DETILETOYUY2_NEON
|
#define HAS_DETILETOYUY2_NEON
|
||||||
@ -823,7 +825,8 @@ struct YuvConstants {
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
|
#define IS_POWEROFTWO(x) (!((x) & ((x) - 1)))
|
||||||
|
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
|
||||||
|
|
||||||
#define align_buffer_64(var, size) \
|
#define align_buffer_64(var, size) \
|
||||||
uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
|
uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \
|
||||||
@ -2012,7 +2015,6 @@ void DetileRow_C(const uint8_t* src,
|
|||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int width);
|
int width);
|
||||||
|
|
||||||
void DetileRow_NEON(const uint8_t* src,
|
void DetileRow_NEON(const uint8_t* src,
|
||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
@ -2029,6 +2031,26 @@ void DetileRow_Any_SSE2(const uint8_t* src,
|
|||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int width);
|
int width);
|
||||||
|
void DetileRow_16_C(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_16_NEON(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_16_Any_NEON(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_16_SSE2(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_16_Any_SSE2(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width);
|
||||||
void DetileSplitUVRow_C(const uint8_t* src_uv,
|
void DetileSplitUVRow_C(const uint8_t* src_uv,
|
||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1848
|
#define LIBYUV_VERSION 1849
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -385,6 +385,7 @@ int I420ToI400(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Copy NV12. Supports inverting.
|
// Copy NV12. Supports inverting.
|
||||||
|
LIBYUV_API
|
||||||
int NV12Copy(const uint8_t* src_y,
|
int NV12Copy(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
const uint8_t* src_uv,
|
const uint8_t* src_uv,
|
||||||
@ -418,6 +419,7 @@ int NV12Copy(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Copy NV21. Supports inverting.
|
// Copy NV21. Supports inverting.
|
||||||
|
LIBYUV_API
|
||||||
int NV21Copy(const uint8_t* src_y,
|
int NV21Copy(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
const uint8_t* src_vu,
|
const uint8_t* src_vu,
|
||||||
@ -916,26 +918,22 @@ int NV21ToNV12(const uint8_t* src_y,
|
|||||||
// tile_height is 16 or 32 for MM21.
|
// tile_height is 16 or 32 for MM21.
|
||||||
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
|
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
|
||||||
// TODO: More detile row functions.
|
// TODO: More detile row functions.
|
||||||
|
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
void DetilePlane(const uint8_t* src_y,
|
int DetilePlane(const uint8_t* src_y,
|
||||||
int src_stride_y,
|
int src_stride_y,
|
||||||
uint8_t* dst_y,
|
uint8_t* dst_y,
|
||||||
int dst_stride_y,
|
int dst_stride_y,
|
||||||
int width,
|
int width,
|
||||||
int height,
|
int height,
|
||||||
int tile_height) {
|
int tile_height) {
|
||||||
const ptrdiff_t src_tile_stride = 16 * tile_height;
|
const ptrdiff_t src_tile_stride = 16 * tile_height;
|
||||||
int y;
|
int y;
|
||||||
void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
|
void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
|
||||||
int width) = DetileRow_C;
|
int width) = DetileRow_C;
|
||||||
assert(src_stride_y >= 0);
|
if (!src_y || !dst_y || width <= 0 || height == 0 || !IS_POWEROFTWO(tile_height)) {
|
||||||
assert(tile_height > 0);
|
return -1;
|
||||||
assert(src_stride_y > 0);
|
|
||||||
|
|
||||||
if (width <= 0 || height == 0) {
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Negative height means invert the image.
|
// Negative height means invert the image.
|
||||||
if (height < 0) {
|
if (height < 0) {
|
||||||
height = -height;
|
height = -height;
|
||||||
@ -970,6 +968,63 @@ void DetilePlane(const uint8_t* src_y,
|
|||||||
src_y = src_y - src_tile_stride + src_stride_y * tile_height;
|
src_y = src_y - src_tile_stride + src_stride_y * tile_height;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert a plane of 16 bit tiles of 16 x H to linear.
|
||||||
|
// tile width is 16 and assumed.
|
||||||
|
// tile_height is 16 or 32 for MT2T.
|
||||||
|
LIBYUV_API
|
||||||
|
int DetilePlane_16(const uint16_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
uint16_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int tile_height) {
|
||||||
|
const ptrdiff_t src_tile_stride = 16 * tile_height;
|
||||||
|
int y;
|
||||||
|
void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst, int width) = DetileRow_16_C;
|
||||||
|
if (!src_y || !dst_y || width <= 0 || height == 0 || !IS_POWEROFTWO(tile_height)) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Negative height means invert the image.
|
||||||
|
if (height < 0) {
|
||||||
|
height = -height;
|
||||||
|
dst_y = dst_y + (height - 1) * dst_stride_y;
|
||||||
|
dst_stride_y = -dst_stride_y;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(HAS_DETILEROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||||
|
DetileRow_16 = DetileRow_16_Any_SSE2;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileRow_16 = DetileRow_16_SSE2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_DETILEROW_NEON)
|
||||||
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
|
DetileRow_16 = DetileRow_16_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileRow_16 = DetileRow_16_NEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Detile plane
|
||||||
|
for (y = 0; y < height; ++y) {
|
||||||
|
DetileRow_16(src_y, src_tile_stride, dst_y, width);
|
||||||
|
dst_y += dst_stride_y;
|
||||||
|
src_y += 16;
|
||||||
|
// Advance to next row of tiles.
|
||||||
|
if ((y & (tile_height - 1)) == (tile_height - 1)) {
|
||||||
|
src_y = src_y - src_tile_stride + src_stride_y * tile_height;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
|
|||||||
@ -2242,26 +2242,31 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
|
|||||||
#endif
|
#endif
|
||||||
#undef ANY11S
|
#undef ANY11S
|
||||||
|
|
||||||
#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \
|
#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \
|
||||||
void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \
|
void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \
|
||||||
int width) { \
|
SIMD_ALIGNED(T temp[16 * 2]); \
|
||||||
SIMD_ALIGNED(uint8_t temp[16 * 2]); \
|
memset(temp, 0, 16 * BPP); /* for msan */ \
|
||||||
memset(temp, 0, 16); /* for msan */ \
|
int r = width & MASK; \
|
||||||
int r = width & MASK; \
|
int n = width & ~MASK; \
|
||||||
int n = width & ~MASK; \
|
if (n > 0) { \
|
||||||
if (n > 0) { \
|
ANY_SIMD(src, src_tile_stride, dst, n); \
|
||||||
ANY_SIMD(src, src_tile_stride, dst, n); \
|
} \
|
||||||
} \
|
memcpy(temp, src + (n / 16) * src_tile_stride, r * BPP); \
|
||||||
memcpy(temp, src + (n / 16) * src_tile_stride, r); \
|
ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \
|
||||||
ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \
|
memcpy(dst + n, temp + 16, r * BPP); \
|
||||||
memcpy(dst + n, temp + 16, r); \
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_DETILEROW_NEON
|
#ifdef HAS_DETILEROW_NEON
|
||||||
ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15)
|
ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_DETILEROW_SSE2
|
#ifdef HAS_DETILEROW_SSE2
|
||||||
ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15)
|
ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_DETILEROW_16_NEON
|
||||||
|
ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15)
|
||||||
|
#endif
|
||||||
|
#ifdef HAS_DETILEROW_16_SSE2
|
||||||
|
ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
|
#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \
|
||||||
|
|||||||
@ -2748,6 +2748,21 @@ void DetileRow_C(const uint8_t* src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Copy one row of 16 bit pixels out of a tiled plane into a linear row.
// src points at the row inside the first tile; consecutive tiles are
// src_tile_stride elements apart and each contributes 16 pixels.
// width is the number of pixels to write to dst. A non-positive width copies
// nothing (previously a negative width could still copy width & 15 pixels).
void DetileRow_16_C(const uint16_t* src,
                    ptrdiff_t src_tile_stride,
                    uint16_t* dst,
                    int width) {
  int remaining;
  if (width <= 0) {
    return;
  }
  // Whole tiles: 16 pixels each.
  for (remaining = width; remaining >= 16; remaining -= 16) {
    memcpy(dst, src, 16 * sizeof(uint16_t));
    dst += 16;
    src += src_tile_stride;
  }
  // Partial last tile.
  if (remaining > 0) {
    memcpy(dst, src, (size_t)remaining * sizeof(uint16_t));
  }
}
|
||||||
|
|
||||||
void DetileSplitUVRow_C(const uint8_t* src_uv,
|
void DetileSplitUVRow_C(const uint8_t* src_uv,
|
||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
|
|||||||
@ -5030,6 +5030,29 @@ void DetileRow_SSE2(const uint8_t* src,
|
|||||||
}
|
}
|
||||||
#endif // HAS_DETILEROW_SSE2
|
#endif // HAS_DETILEROW_SSE2
|
||||||
|
|
||||||
|
#ifdef HAS_DETILEROW_16_SSE2
|
||||||
|
// Detile one row of 16 bit pixels.
// Each loop iteration copies 32 bytes (16 pixels) with two unaligned 16 byte
// loads and stores, then steps src by src_tile_stride * 2 bytes and dst by
// 32 bytes. width is reduced by 16 per iteration and only tested after the
// copy, so the body always runs at least once and copies whole groups of 16.
void DetileRow_16_SSE2(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"movdqu (%0),%%xmm0 \n"
|
||||||
|
"movdqu 0x10(%0),%%xmm1 \n"
|
||||||
|
"lea (%0,%3,2),%0 \n"
|
||||||
|
"movdqu %%xmm0,(%1) \n"
|
||||||
|
"movdqu %%xmm1,0x10(%1) \n"
|
||||||
|
"lea 0x20(%1),%1 \n"
|
||||||
|
"sub $0x10,%2 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(src_tile_stride) // %3
|
||||||
|
: "cc", "memory", "xmm0", "xmm1");
|
||||||
|
}
|
||||||
|
#endif // HAS_DETILEROW_16_SSE2
|
||||||
|
|
||||||
#ifdef HAS_DETILETOYUY2_SSE2
|
#ifdef HAS_DETILETOYUY2_SSE2
|
||||||
// Read 16 Y, 8 UV, and write 8 YUYV.
|
// Read 16 Y, 8 UV, and write 8 YUYV.
|
||||||
void DetileToYUY2_SSE2(const uint8_t* src_y,
|
void DetileToYUY2_SSE2(const uint8_t* src_y,
|
||||||
|
|||||||
@ -622,6 +622,26 @@ void DetileRow_NEON(const uint8_t* src,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads 16 Y's of 16 bits each from a tile and writes out 16 Y's.
// One loop iteration moves 32 bytes through q0/q1, then post-increments src
// by %3 (src_tile_stride * 2, the tile stride in bytes) and dst by the 32
// bytes stored. width is tested after the first copy, so the body always
// runs at least once and copies whole groups of 16 pixels.
|
||||||
|
void DetileRow_16_NEON(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
|
||||||
|
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||||
|
"pld [%0, #3584] \n" // prefetch 3584 bytes ahead of src
|
||||||
|
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
|
||||||
|
"bgt 1b \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(src_tile_stride * 2) // %3: tile stride in bytes
|
||||||
|
: "cc", "memory", "q0", "q1" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
|
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
|
||||||
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
|
|||||||
@ -650,6 +650,26 @@ void DetileRow_NEON(const uint8_t* src,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads 16 Y's of 16 bits each from a tile and writes out 16 Y's.
// One loop iteration moves 32 bytes through v0/v1, then post-increments src
// by %3 (src_tile_stride * 2, the tile stride in bytes) and dst by 32 bytes.
// width is tested after the first copy, so the body always runs at least
// once and copies whole groups of 16 pixels.
|
||||||
|
void DetileRow_16_NEON(const uint16_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint16_t* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
|
||||||
|
"subs %w2, %w2, #16 \n" // 16 processed per loop
|
||||||
|
"prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
|
||||||
|
"st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
|
||||||
|
"b.gt 1b \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(src_tile_stride * 2) // %3: tile stride in bytes
|
||||||
|
: "cc", "memory", "v0", "v1" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
|
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
|
||||||
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
|
|||||||
@ -1638,29 +1638,29 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
|
|||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
// orig is tiled. Allocate enough memory for tiles.
|
// orig is tiled. Allocate enough memory for tiles.
|
||||||
int orig_width = (benchmark_width_ + 15) & ~15;
|
int tile_width = (benchmark_width_ + 15) & ~15;
|
||||||
int orig_height = (benchmark_height_ + 15) & ~15;
|
int tile_height = (benchmark_height_ + 15) & ~15;
|
||||||
int orig_plane_size = orig_width * orig_height;
|
int tile_plane_size = tile_width * tile_height;
|
||||||
int y_plane_size = benchmark_width_ * benchmark_height_;
|
int y_plane_size = benchmark_width_ * benchmark_height_;
|
||||||
align_buffer_page_end(orig_y, orig_plane_size);
|
align_buffer_page_end(tile_y, tile_plane_size);
|
||||||
align_buffer_page_end(dst_c, y_plane_size);
|
align_buffer_page_end(dst_c, y_plane_size);
|
||||||
align_buffer_page_end(dst_opt, y_plane_size);
|
align_buffer_page_end(dst_opt, y_plane_size);
|
||||||
|
|
||||||
MemRandomize(orig_y, orig_plane_size);
|
MemRandomize(tile_y, tile_plane_size);
|
||||||
memset(dst_c, 0, y_plane_size);
|
memset(dst_c, 0, y_plane_size);
|
||||||
memset(dst_opt, 0, y_plane_size);
|
memset(dst_opt, 0, y_plane_size);
|
||||||
|
|
||||||
// Disable all optimizations.
|
// Disable all optimizations.
|
||||||
MaskCpuFlags(disable_cpu_flags_);
|
MaskCpuFlags(disable_cpu_flags_);
|
||||||
for (j = 0; j < benchmark_iterations_; j++) {
|
for (j = 0; j < benchmark_iterations_; j++) {
|
||||||
DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_,
|
DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_,
|
||||||
benchmark_height_, 16);
|
benchmark_height_, 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enable optimizations.
|
// Enable optimizations.
|
||||||
MaskCpuFlags(benchmark_cpu_info_);
|
MaskCpuFlags(benchmark_cpu_info_);
|
||||||
for (j = 0; j < benchmark_iterations_; j++) {
|
for (j = 0; j < benchmark_iterations_; j++) {
|
||||||
DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_,
|
DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_,
|
||||||
benchmark_height_, 16);
|
benchmark_height_, 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1668,7 +1668,46 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
|
|||||||
EXPECT_EQ(dst_c[i], dst_opt[i]);
|
EXPECT_EQ(dst_c[i], dst_opt[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_aligned_buffer_page_end(orig_y);
|
free_aligned_buffer_page_end(tile_y);
|
||||||
|
free_aligned_buffer_page_end(dst_c);
|
||||||
|
free_aligned_buffer_page_end(dst_opt);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detiles the same random 16 bit tiled plane with CPU optimizations disabled
// and enabled, and expects both outputs to match byte for byte.
// The return value of every DetilePlane_16 call is checked: if both paths
// failed, both zero-filled outputs would otherwise compare equal and the test
// would pass without detiling anything.
TEST_F(LibYUVPlanarTest, TestDetilePlane_16) {
  int i, j;

  // tile_y is tiled. Round up to whole 16 x 16 tiles so there is enough
  // memory for every tile. Sizes are in bytes: 2 per 16 bit pixel.
  int tile_width = (benchmark_width_ + 15) & ~15;
  int tile_height = (benchmark_height_ + 15) & ~15;
  int tile_plane_size = tile_width * tile_height * 2;
  int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
  align_buffer_page_end(tile_y, tile_plane_size);
  align_buffer_page_end(dst_c, y_plane_size);
  align_buffer_page_end(dst_opt, y_plane_size);

  MemRandomize(tile_y, tile_plane_size);
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 0, y_plane_size);

  // Disable all optimizations.
  MaskCpuFlags(disable_cpu_flags_);
  for (j = 0; j < benchmark_iterations_; j++) {
    EXPECT_EQ(0, DetilePlane_16((const uint16_t*)tile_y, tile_width,
                                (uint16_t*)dst_c, benchmark_width_,
                                benchmark_width_, benchmark_height_, 16));
  }

  // Enable optimizations.
  MaskCpuFlags(benchmark_cpu_info_);
  for (j = 0; j < benchmark_iterations_; j++) {
    EXPECT_EQ(0, DetilePlane_16((const uint16_t*)tile_y, tile_width,
                                (uint16_t*)dst_opt, benchmark_width_,
                                benchmark_width_, benchmark_height_, 16));
  }

  for (i = 0; i < y_plane_size; ++i) {
    EXPECT_EQ(dst_c[i], dst_opt[i]);
  }

  free_aligned_buffer_page_end(tile_y);
  free_aligned_buffer_page_end(dst_c);
  free_aligned_buffer_page_end(dst_opt);
}
|
||||||
@ -1678,33 +1717,33 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
|
|||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
// orig is tiled. Allocate enough memory for tiles.
|
// orig is tiled. Allocate enough memory for tiles.
|
||||||
int orig_width = (benchmark_width_ + 15) & ~15;
|
int tile_width = (benchmark_width_ + 15) & ~15;
|
||||||
int orig_height = (benchmark_height_ + 15) & ~15;
|
int tile_height = (benchmark_height_ + 15) & ~15;
|
||||||
int orig_plane_size = orig_width * orig_height;
|
int tile_plane_size = tile_width * tile_height;
|
||||||
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
|
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
|
||||||
align_buffer_page_end(orig_uv, orig_plane_size);
|
align_buffer_page_end(tile_uv, tile_plane_size);
|
||||||
align_buffer_page_end(detiled_uv, orig_plane_size);
|
align_buffer_page_end(detiled_uv, tile_plane_size);
|
||||||
align_buffer_page_end(dst_u_two_stage, uv_plane_size);
|
align_buffer_page_end(dst_u_two_stage, uv_plane_size);
|
||||||
align_buffer_page_end(dst_u_opt, uv_plane_size);
|
align_buffer_page_end(dst_u_opt, uv_plane_size);
|
||||||
align_buffer_page_end(dst_v_two_stage, uv_plane_size);
|
align_buffer_page_end(dst_v_two_stage, uv_plane_size);
|
||||||
align_buffer_page_end(dst_v_opt, uv_plane_size);
|
align_buffer_page_end(dst_v_opt, uv_plane_size);
|
||||||
|
|
||||||
MemRandomize(orig_uv, orig_plane_size);
|
MemRandomize(tile_uv, tile_plane_size);
|
||||||
memset(detiled_uv, 0, orig_plane_size);
|
memset(detiled_uv, 0, tile_plane_size);
|
||||||
memset(dst_u_two_stage, 0, uv_plane_size);
|
memset(dst_u_two_stage, 0, uv_plane_size);
|
||||||
memset(dst_u_opt, 0, uv_plane_size);
|
memset(dst_u_opt, 0, uv_plane_size);
|
||||||
memset(dst_v_two_stage, 0, uv_plane_size);
|
memset(dst_v_two_stage, 0, uv_plane_size);
|
||||||
memset(dst_v_opt, 0, uv_plane_size);
|
memset(dst_v_opt, 0, uv_plane_size);
|
||||||
|
|
||||||
DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2,
|
DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2,
|
||||||
dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_,
|
dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_,
|
||||||
benchmark_height_, 16);
|
benchmark_height_, 16);
|
||||||
|
|
||||||
// Benchmark 2 step conversion for comparison.
|
// Benchmark 2 step conversion for comparison.
|
||||||
for (j = 0; j < benchmark_iterations_; j++) {
|
for (j = 0; j < benchmark_iterations_; j++) {
|
||||||
DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_,
|
DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_,
|
||||||
benchmark_width_, benchmark_height_, 16);
|
benchmark_width_, benchmark_height_, 16);
|
||||||
SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage,
|
SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage,
|
||||||
(benchmark_width_ + 1) / 2, dst_v_two_stage,
|
(benchmark_width_ + 1) / 2, dst_v_two_stage,
|
||||||
(benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
|
(benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
|
||||||
benchmark_height_);
|
benchmark_height_);
|
||||||
@ -1715,7 +1754,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
|
|||||||
EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
|
EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_aligned_buffer_page_end(orig_uv);
|
free_aligned_buffer_page_end(tile_uv);
|
||||||
free_aligned_buffer_page_end(detiled_uv);
|
free_aligned_buffer_page_end(detiled_uv);
|
||||||
free_aligned_buffer_page_end(dst_u_two_stage);
|
free_aligned_buffer_page_end(dst_u_two_stage);
|
||||||
free_aligned_buffer_page_end(dst_u_opt);
|
free_aligned_buffer_page_end(dst_u_opt);
|
||||||
@ -1727,17 +1766,17 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
|
|||||||
int i, j;
|
int i, j;
|
||||||
|
|
||||||
// orig is tiled. Allocate enough memory for tiles.
|
// orig is tiled. Allocate enough memory for tiles.
|
||||||
int orig_width = (benchmark_width_ + 15) & ~15;
|
int tile_width = (benchmark_width_ + 15) & ~15;
|
||||||
int orig_height = (benchmark_height_ + 15) & ~15;
|
int tile_height = (benchmark_height_ + 15) & ~15;
|
||||||
int orig_plane_size = orig_width * orig_height;
|
int tile_plane_size = tile_width * tile_height;
|
||||||
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
|
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
|
||||||
align_buffer_page_end(orig_uv, orig_plane_size);
|
align_buffer_page_end(tile_uv, tile_plane_size);
|
||||||
align_buffer_page_end(dst_u_c, uv_plane_size);
|
align_buffer_page_end(dst_u_c, uv_plane_size);
|
||||||
align_buffer_page_end(dst_u_opt, uv_plane_size);
|
align_buffer_page_end(dst_u_opt, uv_plane_size);
|
||||||
align_buffer_page_end(dst_v_c, uv_plane_size);
|
align_buffer_page_end(dst_v_c, uv_plane_size);
|
||||||
align_buffer_page_end(dst_v_opt, uv_plane_size);
|
align_buffer_page_end(dst_v_opt, uv_plane_size);
|
||||||
|
|
||||||
MemRandomize(orig_uv, orig_plane_size);
|
MemRandomize(tile_uv, tile_plane_size);
|
||||||
memset(dst_u_c, 0, uv_plane_size);
|
memset(dst_u_c, 0, uv_plane_size);
|
||||||
memset(dst_u_opt, 0, uv_plane_size);
|
memset(dst_u_opt, 0, uv_plane_size);
|
||||||
memset(dst_v_c, 0, uv_plane_size);
|
memset(dst_v_c, 0, uv_plane_size);
|
||||||
@ -1746,7 +1785,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
|
|||||||
// Disable all optimizations.
|
// Disable all optimizations.
|
||||||
MaskCpuFlags(disable_cpu_flags_);
|
MaskCpuFlags(disable_cpu_flags_);
|
||||||
|
|
||||||
DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2,
|
DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2,
|
||||||
dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
|
dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
|
||||||
benchmark_height_, 16);
|
benchmark_height_, 16);
|
||||||
|
|
||||||
@ -1755,7 +1794,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
|
|||||||
|
|
||||||
for (j = 0; j < benchmark_iterations_; j++) {
|
for (j = 0; j < benchmark_iterations_; j++) {
|
||||||
DetileSplitUVPlane(
|
DetileSplitUVPlane(
|
||||||
orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
|
tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
|
||||||
(benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
|
(benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1764,7 +1803,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
|
|||||||
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
|
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
free_aligned_buffer_page_end(orig_uv);
|
free_aligned_buffer_page_end(tile_uv);
|
||||||
free_aligned_buffer_page_end(dst_u_c);
|
free_aligned_buffer_page_end(dst_u_c);
|
||||||
free_aligned_buffer_page_end(dst_u_opt);
|
free_aligned_buffer_page_end(dst_u_opt);
|
||||||
free_aligned_buffer_page_end(dst_v_c);
|
free_aligned_buffer_page_end(dst_v_c);
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user