mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Add support for MM21.
Add support for MM21 to NV12 and I420 conversion, and add SIMD optimizations for arm, aarch64, SSE2, and SSSE3 machines. Bug: libyuv:915, b/215425056 Change-Id: Iecb0c33287f35766a6169d4adf3b7397f1ba8b5d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3433269 Reviewed-by: Frank Barchard <fbarchard@chromium.org> Commit-Queue: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
804980bbab
commit
b4ddbaf549
@ -1,8 +1,8 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1810
|
Version: 1811
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
Description:
|
Description:
|
||||||
libyuv is an open source project that includes YUV conversion and scaling functionality.
|
libyuv is an open source project that includes YUV conversion and scaling functionality.
|
||||||
|
|||||||
@ -106,6 +106,34 @@ int I422ToI444(const uint8_t* src_y,
|
|||||||
int width,
|
int width,
|
||||||
int height);
|
int height);
|
||||||
|
|
||||||
|
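// Convert MM21 to NV12.
// MM21 is a tiled layout: the Y plane is detiled with 16x32 tiles and the
// interleaved UV plane with 16x16 tiles. dst_y may be NULL to convert only UV.
// Returns 0 on success, -1 on invalid arguments.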
|
||||||
|
LIBYUV_API
|
||||||
|
int MM21ToNV12(const uint8_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
uint8_t* dst_uv,
|
||||||
|
int dst_stride_uv,
|
||||||
|
int width,
|
||||||
|
int height);
|
||||||
|
|
||||||
|
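// Convert MM21 to I420.
// MM21 is a tiled layout: the Y plane is detiled with 16x32 tiles and the
// interleaved UV plane with 16x16 tiles; UV is additionally split into
// separate U and V planes. dst_y may be NULL to convert only U and V.
// Returns 0 on success, -1 on invalid arguments.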
|
||||||
|
LIBYUV_API
|
||||||
|
int MM21ToI420(const uint8_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
int dst_stride_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int dst_stride_v,
|
||||||
|
int width,
|
||||||
|
int height);
|
||||||
|
|
||||||
// Convert I422 to NV21.
|
// Convert I422 to NV21.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
int I422ToNV21(const uint8_t* src_y,
|
int I422ToNV21(const uint8_t* src_y,
|
||||||
|
|||||||
@ -93,6 +93,18 @@ void DetilePlane(const uint8_t* src_y,
|
|||||||
int height,
|
int height,
|
||||||
int tile_height);
|
int tile_height);
|
||||||
|
|
||||||
|
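// Convert a UV plane of tiles of 16 x H into linear U and V planes.
// width is the number of interleaved UV bytes per row (half of them go to
// each of dst_u and dst_v). A negative height writes the rows bottom-up.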
|
||||||
|
LIBYUV_API
|
||||||
|
void DetileSplitUVPlane(const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
int dst_stride_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int dst_stride_v,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int tile_height);
|
||||||
|
|
||||||
// Split interleaved UV plane into separate U and V planes.
|
// Split interleaved UV plane into separate U and V planes.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
void SplitUVPlane(const uint8_t* src_uv,
|
void SplitUVPlane(const uint8_t* src_uv,
|
||||||
|
|||||||
@ -290,6 +290,8 @@ extern "C" {
|
|||||||
#define HAS_AB64TOARGBROW_SSSE3
|
#define HAS_AB64TOARGBROW_SSSE3
|
||||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||||
#define HAS_CONVERT8TO16ROW_SSE2
|
#define HAS_CONVERT8TO16ROW_SSE2
|
||||||
|
#define HAS_DETILEROW_SSE2
|
||||||
|
#define HAS_DETILESPLITUVROW_SSSE3
|
||||||
#define HAS_HALFMERGEUVROW_SSSE3
|
#define HAS_HALFMERGEUVROW_SSSE3
|
||||||
#define HAS_I210TOAR30ROW_SSSE3
|
#define HAS_I210TOAR30ROW_SSSE3
|
||||||
#define HAS_I210TOARGBROW_SSSE3
|
#define HAS_I210TOARGBROW_SSSE3
|
||||||
@ -537,6 +539,7 @@ extern "C" {
|
|||||||
#define HAS_GAUSSROW_F32_NEON
|
#define HAS_GAUSSROW_F32_NEON
|
||||||
#define HAS_GAUSSCOL_F32_NEON
|
#define HAS_GAUSSCOL_F32_NEON
|
||||||
#define HAS_DETILEROW_NEON
|
#define HAS_DETILEROW_NEON
|
||||||
|
#define HAS_DETILESPLITUVROW_NEON
|
||||||
#endif
|
#endif
|
||||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||||
#define HAS_ABGRTOUVROW_MSA
|
#define HAS_ABGRTOUVROW_MSA
|
||||||
@ -1839,6 +1842,43 @@ void DetileRow_NEON(const uint8_t* src,
|
|||||||
ptrdiff_t src_tile_stride,
|
ptrdiff_t src_tile_stride,
|
||||||
uint8_t* dst,
|
uint8_t* dst,
|
||||||
int width);
|
int width);
|
||||||
|
// Detile row helpers. Each call linearizes one row of a tiled plane: 16 bytes
// are read from the current tile, then src advances by src_tile_stride to the
// same row of the next tile, while dst is written contiguously.
// The _Any_ variants accept widths that are not a multiple of 16.
void DetileRow_Any_NEON(const uint8_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_SSE2(const uint8_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width);
|
||||||
|
void DetileRow_Any_SSE2(const uint8_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width);
|
||||||
|
// DetileSplitUVRow: same traversal as DetileRow, but the interleaved UV bytes
// of each tile row are de-interleaved into dst_u and dst_v. width counts the
// interleaved source bytes, so width / 2 bytes are written to each output.
void DetileSplitUVRow_C(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
|
void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
|
void DetileSplitUVRow_Any_SSSE3(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
|
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
|
void DetileSplitUVRow_Any_NEON(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width);
|
||||||
void MergeUVRow_C(const uint8_t* src_u,
|
void MergeUVRow_C(const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
uint8_t* dst_uv,
|
uint8_t* dst_uv,
|
||||||
|
|||||||
@ -564,6 +564,60 @@ int I422ToNV21(const uint8_t* src_y,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LIBYUV_API
|
||||||
|
int MM21ToNV12(const uint8_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
uint8_t* dst_uv,
|
||||||
|
int dst_stride_uv,
|
||||||
|
int width,
|
||||||
|
int height) {
|
||||||
|
if (!src_uv || !dst_uv || width <= 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int sign = height < 0 ? -1 : 1;
|
||||||
|
|
||||||
|
if (dst_y) {
|
||||||
|
DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
|
||||||
|
}
|
||||||
|
DetilePlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, (width + 1) & ~1,
|
||||||
|
(height + sign) / 2, 16);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
LIBYUV_API
|
||||||
|
int MM21ToI420(const uint8_t* src_y,
|
||||||
|
int src_stride_y,
|
||||||
|
const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int dst_stride_y,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
int dst_stride_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int dst_stride_v,
|
||||||
|
int width,
|
||||||
|
int height) {
|
||||||
|
int sign = height < 0 ? -1 : 1;
|
||||||
|
|
||||||
|
if (!src_uv || !dst_u || !dst_v || width <= 0) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dst_y) {
|
||||||
|
DetilePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, 32);
|
||||||
|
}
|
||||||
|
DetileSplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
|
||||||
|
dst_stride_v, (width + 1) & ~1, (height + sign) / 2, 16);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef I422TONV21_ROW_VERSION
|
#ifdef I422TONV21_ROW_VERSION
|
||||||
// Unittest fails for this version.
|
// Unittest fails for this version.
|
||||||
// 422 chroma is 1/2 width, 1x height
|
// 422 chroma is 1/2 width, 1x height
|
||||||
|
|||||||
@ -882,9 +882,20 @@ void DetilePlane(const uint8_t* src_y,
|
|||||||
dst_stride_y = -dst_stride_y;
|
dst_stride_y = -dst_stride_y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(HAS_DETILEROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||||
|
DetileRow = DetileRow_Any_SSE2;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileRow = DetileRow_SSE2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_DETILEROW_NEON)
|
#if defined(HAS_DETILEROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
DetileRow = DetileRow_NEON;
|
DetileRow = DetileRow_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileRow = DetileRow_NEON;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -900,6 +911,64 @@ void DetilePlane(const uint8_t* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LIBYUV_API
|
||||||
|
void DetileSplitUVPlane(const uint8_t* src_uv,
|
||||||
|
int src_stride_uv,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
int dst_stride_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int dst_stride_v,
|
||||||
|
int width,
|
||||||
|
int height,
|
||||||
|
int tile_height) {
|
||||||
|
const ptrdiff_t src_tile_stride = 16 * tile_height;
|
||||||
|
int y;
|
||||||
|
void (*DetileSplitUVRow)(const uint8_t* src, ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u, uint8_t* dst_v, int width) =
|
||||||
|
DetileSplitUVRow_C;
|
||||||
|
assert(src_stride_uv >= 0);
|
||||||
|
assert(tile_height > 0);
|
||||||
|
assert(src_stride_uv > 0);
|
||||||
|
|
||||||
|
// Negative height means invert the image.
|
||||||
|
if (height < 0) {
|
||||||
|
height = -height;
|
||||||
|
dst_u = dst_u + (height - 1) * dst_stride_u;
|
||||||
|
dst_stride_u = -dst_stride_u;
|
||||||
|
dst_v = dst_v + (height - 1) * dst_stride_v;
|
||||||
|
dst_stride_v = -dst_stride_v;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(HAS_DETILESPLITUVROW_SSSE3)
|
||||||
|
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||||
|
DetileSplitUVRow = DetileSplitUVRow_Any_SSSE3;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileSplitUVRow = DetileSplitUVRow_SSSE3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_DETILESPLITROW_NEON)
|
||||||
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
|
DetileSplitUVRow = DetileSplitUVRow_Any_NEON;
|
||||||
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
DetileSplitUVRow = DetileSplitUVRow_NEON;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Detile plane
|
||||||
|
for (y = 0; y < height; ++y) {
|
||||||
|
DetileSplitUVRow(src_uv, src_tile_stride, dst_u, dst_v, width);
|
||||||
|
dst_u += dst_stride_u;
|
||||||
|
dst_v += dst_stride_v;
|
||||||
|
src_uv += 16;
|
||||||
|
// Advance to next row of tiles.
|
||||||
|
if ((y & (tile_height - 1)) == (tile_height - 1)) {
|
||||||
|
src_uv = src_uv - src_tile_stride + src_stride_uv * tile_height;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Support function for NV12 etc RGB channels.
|
// Support function for NV12 etc RGB channels.
|
||||||
// Width and height are plane sizes (typically half pixel width).
|
// Width and height are plane sizes (typically half pixel width).
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
|
|||||||
@ -2059,6 +2059,51 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
|
|||||||
#endif
|
#endif
|
||||||
#undef ANY11S
|
#undef ANY11S
|
||||||
|
|
||||||
|
// Generates an any-width wrapper around a detile row kernel that consumes
// MASK + 1 bytes per iteration. The leading multiple of MASK + 1 bytes is
// written straight to dst; the leftover bytes are staged through a stack
// buffer so only the requested bytes of the caller's buffers are touched.
#define ANYDETILE(NAMEANY, ANY_SIMD, MASK)                                  \
  void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \
               int width) {                                                 \
    /* Bytes 0..15: staged input. Bytes 16..31: kernel output. */           \
    SIMD_ALIGNED(uint8_t scratch[16 * 2]);                                  \
    const int tail = width & MASK;                                          \
    const int body = width & ~MASK;                                         \
    memset(scratch, 0, 16); /* for msan */                                  \
    if (body > 0) {                                                         \
      ANY_SIMD(src, src_tile_stride, dst, body);                            \
    }                                                                       \
    /* The leftover bytes sit in the tile after the last complete one. */   \
    memcpy(scratch, src + (body / 16) * src_tile_stride, tail);             \
    ANY_SIMD(scratch, src_tile_stride, scratch + 16, MASK + 1);             \
    memcpy(dst + body, scratch + 16, tail);                                 \
  }

#ifdef HAS_DETILEROW_NEON
ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15)
#endif
#ifdef HAS_DETILEROW_SSE2
ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15)
#endif
|
||||||
|
|
||||||
|
// Generates an any-width wrapper around a detile-and-split UV row kernel.
// The leading multiple of MASK + 1 bytes is converted directly into dst_u and
// dst_v; the leftover bytes are staged through a stack buffer and the kernel
// is run once more on it, after which (r + 1) / 2 bytes are copied to each
// output.
#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK)                      \
  void NAMEANY(const uint8_t* src_uv, ptrdiff_t src_tile_stride,       \
               uint8_t* dst_u, uint8_t* dst_v, int width) {            \
    /* Bytes 0..15: staged UV input. 16..23: U out. 24..31: V out. */  \
    SIMD_ALIGNED(uint8_t scratch[16 * 2]);                             \
    const int tail = width & MASK;                                     \
    const int body = width & ~MASK;                                    \
    memset(scratch, 0, 16 * 2); /* for msan */                         \
    if (body > 0) {                                                    \
      ANY_SIMD(src_uv, src_tile_stride, dst_u, dst_v, body);           \
    }                                                                  \
    /* The leftover bytes sit in the tile after the last full one. */  \
    memcpy(scratch, src_uv + (body / 16) * src_tile_stride, tail);     \
    ANY_SIMD(scratch, src_tile_stride, scratch + 16, scratch + 24,     \
             tail);                                                    \
    memcpy(dst_u + body / 2, scratch + 16, (tail + 1) / 2);            \
    memcpy(dst_v + body / 2, scratch + 24, (tail + 1) / 2);            \
  }

#ifdef HAS_DETILESPLITUVROW_NEON
ANYDETILESPLITUV(DetileSplitUVRow_Any_NEON, DetileSplitUVRow_NEON, 15)
#endif
#ifdef HAS_DETILESPLITUVROW_SSSE3
ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
@ -2674,6 +2674,30 @@ void DetileRow_C(const uint8_t* src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Portable detile-and-split for one row of interleaved UV.
// For every complete 16-byte tile row, 8 U and 8 V bytes are written and the
// source moves on by src_tile_stride to the same row of the next tile. Any
// remaining (width & 15) bytes are read from the following tile as whole UV
// pairs. width counts interleaved source bytes.
void DetileSplitUVRow_C(const uint8_t* src_uv,
                        ptrdiff_t src_tile_stride,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  const int full_tiles = width / 16;
  const int tail_pairs = (width & 0xF) / 2;
  int t;
  int i;

  for (t = 0; t < full_tiles; ++t) {
    for (i = 0; i < 8; ++i) {
      dst_u[i] = src_uv[2 * i];
      dst_v[i] = src_uv[2 * i + 1];
    }
    dst_u += 8;
    dst_v += 8;
    src_uv += src_tile_stride;
  }

  for (i = 0; i < tail_pairs; ++i) {
    dst_u[i] = src_uv[2 * i];
    dst_v[i] = src_uv[2 * i + 1];
  }
}
|
||||||
|
|
||||||
void SplitUVRow_C(const uint8_t* src_uv,
|
void SplitUVRow_C(const uint8_t* src_uv,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
uint8_t* dst_v,
|
uint8_t* dst_v,
|
||||||
|
|||||||
@ -9,7 +9,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "libyuv/row.h"
|
#include "libyuv/row.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
namespace libyuv {
|
namespace libyuv {
|
||||||
extern "C" {
|
extern "C" {
|
||||||
@ -4765,6 +4764,63 @@ void SplitUVRow_SSE2(const uint8_t* src_uv,
|
|||||||
}
|
}
|
||||||
#endif // HAS_SPLITUVROW_SSE2
|
#endif // HAS_SPLITUVROW_SSE2
|
||||||
|
|
||||||
|
// Copies one row out of a tiled plane, 16 bytes per iteration: each load comes
// from the current tile, src then steps by src_tile_stride to the next tile,
// and dst is written contiguously. The loop body always runs at least once
// and consumes width in steps of 16.
#ifdef HAS_DETILEROW_SSE2
|
||||||
|
void DetileRow_SSE2(const uint8_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"movdqu (%0),%%xmm0 \n" // load 16 bytes from the current tile
|
||||||
|
"sub $0x10,%2 \n" // width -= 16; sets the flags tested by jg
|
||||||
|
"lea (%0,%3),%0 \n" // src += src_tile_stride
|
||||||
|
"movdqu %%xmm0,(%1) \n" // store 16 bytes to dst
|
||||||
|
"lea 0x10(%1),%1 \n" // dst += 16
|
||||||
|
"jg 1b \n" // loop while width > 0
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(src_tile_stride) // %3
|
||||||
|
: "cc", "memory", "xmm0");
|
||||||
|
}
|
||||||
|
#endif // HAS_DETILEROW_SSE2
|
||||||
|
|
||||||
|
#ifdef HAS_DETILESPLITUVROW_SSSE3
|
||||||
|
// TODO(greenjustin): Look into generating these constants instead of loading
|
||||||
|
// them since this can cause branch mispredicts for fPIC code on 32-bit
|
||||||
|
// machines.
|
||||||
|
// pshufb mask: even source bytes (U) go to the low 8 bytes of the result and
// odd source bytes (V) go to the high 8 bytes.
static const uvec8 kDeinterlaceUV = {0, 2, 4, 6, 8, 10, 12, 14,
|
||||||
|
1, 3, 5, 7, 9, 11, 13, 15};
|
||||||
|
|
||||||
|
// TODO(greenjustin): Research alternatives to pshufb, since pshufb can be very
|
||||||
|
// slow on older SSE2 processors.
|
||||||
|
// Detiles and splits one row of interleaved UV, 16 source bytes per
// iteration: 8 bytes are written to dst_u and 8 to dst_v, and src_uv steps by
// src_tile_stride to the next tile. The loop body always runs at least once.
void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"movdqu %4,%%xmm1 \n" // xmm1 = kDeinterlaceUV shuffle mask
|
||||||
|
"1: \n"
|
||||||
|
"movdqu (%0),%%xmm0 \n" // load 16 interleaved UV bytes
|
||||||
|
"lea (%0, %5),%0 \n" // src_uv += src_tile_stride
|
||||||
|
"pshufb %%xmm1,%%xmm0 \n" // low 8 bytes = U, high 8 bytes = V
|
||||||
|
"movq %%xmm0,(%1) \n" // store the low 8 bytes (U)
|
||||||
|
"lea 0x8(%1),%1 \n" // dst_u += 8
|
||||||
|
"movhps %%xmm0,(%2) \n" // store the high 8 bytes (V)
|
||||||
|
"lea 0x8(%2),%2 \n" // dst_v += 8
|
||||||
|
"sub $0x10,%3 \n" // width -= 16; sets the flags tested by jg
|
||||||
|
"jg 1b \n" // loop while width > 0
|
||||||
|
: "+r"(src_uv), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "m"(kDeinterlaceUV), // %4
|
||||||
|
"r"(src_tile_stride) // %5
|
||||||
|
: "cc", "memory", "xmm0", "xmm1");
|
||||||
|
}
|
||||||
|
#endif // HAS_DETILESPLITUVROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_MERGEUVROW_AVX2
|
#ifdef HAS_MERGEUVROW_AVX2
|
||||||
void MergeUVRow_AVX2(const uint8_t* src_u,
|
void MergeUVRow_AVX2(const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
|
|||||||
@ -575,6 +575,52 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reads 16 bytes from a tile and writes them out linearly, then steps to the
// same row of the next tile; repeats until width is consumed.
|
||||||
|
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
|
||||||
|
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
|
||||||
|
// width measured in bytes so 8 UV = 16.
|
||||||
|
void DetileRow_NEON(const uint8_t* src,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"vld1.16 {q0}, [%0], %3 \n" // load 16 bytes; src += src_tile_stride
|
||||||
|
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||||
|
"pld [%0, 1792] \n" // prefetch 1792 bytes past the updated src
|
||||||
|
"vst1.16 {q0}, [%1]! \n" // store 16 bytes
|
||||||
|
"bgt 1b \n" // loop while width > 0
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
: "r"(src_tile_stride) // %3
|
||||||
|
: "cc", "memory", "q0" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
// Repeats per tile until width (in interleaved UV bytes) is consumed; the
// loop body always runs at least once.
|
||||||
|
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"vld2.8 {d0, d1}, [%0], %4 \n" // d0 = 8 U, d1 = 8 V; src_uv += src_tile_stride
|
||||||
|
"subs %3, %3, #16 \n" // 16 source bytes processed per loop
|
||||||
|
"pld [%0, 1792] \n" // prefetch 1792 bytes past the updated src_uv
|
||||||
|
"vst1.8 {d0}, [%1]! \n" // store 8 U bytes; dst_u += 8
|
||||||
|
"vst1.8 {d1}, [%2]! \n" // store 8 V bytes; dst_v += 8
|
||||||
|
"bgt 1b \n" // loop while width > 0
|
||||||
|
: "+r"(src_uv), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "r"(src_tile_stride) // %4
|
||||||
|
: "cc", "memory", "d0", "d1" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
||||||
void MergeUVRow_NEON(const uint8_t* src_u,
|
void MergeUVRow_NEON(const uint8_t* src_u,
|
||||||
const uint8_t* src_v,
|
const uint8_t* src_v,
|
||||||
|
|||||||
@ -627,6 +627,29 @@ void DetileRow_NEON(const uint8_t* src,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
// Repeats per tile until width (in interleaved UV bytes) is consumed; the
// loop body always runs at least once.
|
||||||
|
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
|
||||||
|
ptrdiff_t src_tile_stride,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ld2 {v0.8b,v1.8b}, [%0], %4 \n" // v0 = 8 U, v1 = 8 V; src_uv += src_tile_stride
|
||||||
|
"subs %w3, %w3, #16 \n" // 16 source bytes processed per loop
|
||||||
|
"prfm pldl1keep, [%0, 1792] \n" // prefetch 1792 bytes past the updated src_uv
|
||||||
|
"st1 {v0.8b}, [%1], #8 \n" // store 8 U bytes; dst_u += 8
|
||||||
|
"st1 {v1.8b}, [%2], #8 \n" // store 8 V bytes; dst_v += 8
|
||||||
|
"b.gt 1b \n" // loop while width > 0
|
||||||
|
: "+r"(src_uv), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "r"(src_tile_stride) // %4
|
||||||
|
: "cc", "memory", "v0", "v1" // Clobber List
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#if LIBYUV_USE_ST2
|
#if LIBYUV_USE_ST2
|
||||||
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
// Reads 16 U's and V's and writes out 16 pairs of UV.
|
||||||
void MergeUVRow_NEON(const uint8_t* src_u,
|
void MergeUVRow_NEON(const uint8_t* src_u,
|
||||||
|
|||||||
@ -419,7 +419,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
|
|||||||
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
||||||
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
|
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
|
||||||
DOY, SRC_DEPTH) \
|
DOY, SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
||||||
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
|
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
|
||||||
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
|
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
|
||||||
@ -433,13 +433,18 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
|
|||||||
"DST_SUBSAMP_Y unsupported"); \
|
"DST_SUBSAMP_Y unsupported"); \
|
||||||
const int kWidth = W1280; \
|
const int kWidth = W1280; \
|
||||||
const int kHeight = benchmark_height_; \
|
const int kHeight = benchmark_height_; \
|
||||||
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
|
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
|
||||||
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
|
|
||||||
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
|
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
|
||||||
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
|
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
|
||||||
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
|
const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
|
||||||
align_buffer_page_end(src_uv, \
|
const int kPaddedHeight = \
|
||||||
2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
|
(kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
|
||||||
|
const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
|
||||||
|
const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
|
||||||
|
align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
|
||||||
|
align_buffer_page_end( \
|
||||||
|
src_uv, \
|
||||||
|
2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \
|
||||||
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
|
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
|
||||||
align_buffer_page_end(dst_uv_c, \
|
align_buffer_page_end(dst_uv_c, \
|
||||||
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
|
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
|
||||||
@ -448,11 +453,11 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
|
|||||||
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
|
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
|
||||||
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
|
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
|
||||||
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
|
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
|
||||||
for (int i = 0; i < kWidth * kHeight; ++i) { \
|
for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
|
||||||
src_y_p[i] = \
|
src_y_p[i] = \
|
||||||
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
||||||
} \
|
} \
|
||||||
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
|
for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
|
||||||
src_uv_p[i] = \
|
src_uv_p[i] = \
|
||||||
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
||||||
} \
|
} \
|
||||||
@ -497,136 +502,148 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
|
|||||||
|
|
||||||
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
||||||
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
|
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
|
||||||
|
TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \
|
DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \
|
||||||
SRC_DEPTH) \
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \
|
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \
|
||||||
SRC_DEPTH) \
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
|
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
|
||||||
SRC_DEPTH) \
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
|
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH, \
|
||||||
|
TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
|
DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
|
||||||
SRC_DEPTH)
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT)
|
||||||
|
|
||||||
TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
|
TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1)
|
||||||
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
|
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8, 1, 1)
|
||||||
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
|
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
|
||||||
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
|
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8, 1, 1)
|
||||||
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10)
|
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10, 1, 1)
|
||||||
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10)
|
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10, 1, 1)
|
||||||
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10)
|
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10, 1, 1)
|
||||||
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12)
|
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12, 1, 1)
|
||||||
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12)
|
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12, 1, 1)
|
||||||
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12)
|
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12, 1, 1)
|
||||||
|
TESTBIPLANARTOBP(MM21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 16, 32)
|
||||||
|
|
||||||
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
||||||
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
|
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
|
||||||
SRC_DEPTH) \
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
|
||||||
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
|
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
|
||||||
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
|
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
|
||||||
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
|
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
|
||||||
"SRC_SUBSAMP_X unsupported"); \
|
"SRC_SUBSAMP_X unsupported"); \
|
||||||
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
|
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
|
||||||
"SRC_SUBSAMP_Y unsupported"); \
|
"SRC_SUBSAMP_Y unsupported"); \
|
||||||
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
|
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
|
||||||
"DST_SUBSAMP_X unsupported"); \
|
"DST_SUBSAMP_X unsupported"); \
|
||||||
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
|
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
|
||||||
"DST_SUBSAMP_Y unsupported"); \
|
"DST_SUBSAMP_Y unsupported"); \
|
||||||
const int kWidth = W1280; \
|
const int kWidth = W1280; \
|
||||||
const int kHeight = benchmark_height_; \
|
const int kHeight = benchmark_height_; \
|
||||||
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
|
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
|
||||||
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
|
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
|
||||||
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
|
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
|
||||||
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
|
const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \
|
||||||
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
|
const int kPaddedHeight = \
|
||||||
align_buffer_page_end(src_uv, \
|
(kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \
|
||||||
kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \
|
const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \
|
||||||
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
|
const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \
|
||||||
align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \
|
||||||
align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
align_buffer_page_end( \
|
||||||
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
|
src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \
|
||||||
align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
|
||||||
align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
|
align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
|
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
|
||||||
for (int i = 0; i < kWidth * kHeight; ++i) { \
|
align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
src_y_p[i] = \
|
align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
|
||||||
} \
|
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
|
||||||
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \
|
for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \
|
||||||
src_uv_p[i] = \
|
src_y_p[i] = \
|
||||||
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
||||||
} \
|
} \
|
||||||
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
|
for (int i = 0; i < kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * 2; ++i) { \
|
||||||
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
src_uv_p[i] = \
|
||||||
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
(fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \
|
||||||
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
|
} \
|
||||||
memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
|
||||||
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
MaskCpuFlags(disable_cpu_flags_); \
|
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
SRC_FMT_PLANAR##To##FMT_PLANAR( \
|
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
|
||||||
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
|
memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
|
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
|
||||||
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
|
MaskCpuFlags(disable_cpu_flags_); \
|
||||||
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
|
SRC_FMT_PLANAR##To##FMT_PLANAR( \
|
||||||
NEG kHeight); \
|
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
|
||||||
MaskCpuFlags(benchmark_cpu_info_); \
|
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
|
||||||
for (int i = 0; i < benchmark_iterations_; ++i) { \
|
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
|
||||||
SRC_FMT_PLANAR##To##FMT_PLANAR( \
|
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
|
||||||
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
|
NEG kHeight); \
|
||||||
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
|
MaskCpuFlags(benchmark_cpu_info_); \
|
||||||
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
|
for (int i = 0; i < benchmark_iterations_; ++i) { \
|
||||||
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
|
SRC_FMT_PLANAR##To##FMT_PLANAR( \
|
||||||
NEG kHeight); \
|
src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \
|
||||||
} \
|
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
|
||||||
for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
|
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
|
||||||
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
|
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
|
||||||
} \
|
NEG kHeight); \
|
||||||
for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
|
} \
|
||||||
EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
|
for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \
|
||||||
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
|
EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \
|
||||||
} \
|
} \
|
||||||
free_aligned_buffer_page_end(dst_y_c); \
|
for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \
|
||||||
free_aligned_buffer_page_end(dst_u_c); \
|
EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \
|
||||||
free_aligned_buffer_page_end(dst_v_c); \
|
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \
|
||||||
free_aligned_buffer_page_end(dst_y_opt); \
|
} \
|
||||||
free_aligned_buffer_page_end(dst_u_opt); \
|
free_aligned_buffer_page_end(dst_y_c); \
|
||||||
free_aligned_buffer_page_end(dst_v_opt); \
|
free_aligned_buffer_page_end(dst_u_c); \
|
||||||
free_aligned_buffer_page_end(src_y); \
|
free_aligned_buffer_page_end(dst_v_c); \
|
||||||
free_aligned_buffer_page_end(src_uv); \
|
free_aligned_buffer_page_end(dst_y_opt); \
|
||||||
|
free_aligned_buffer_page_end(dst_u_opt); \
|
||||||
|
free_aligned_buffer_page_end(dst_v_opt); \
|
||||||
|
free_aligned_buffer_page_end(src_y); \
|
||||||
|
free_aligned_buffer_page_end(src_uv); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
#define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
|
||||||
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
|
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH, TILE_WIDTH, \
|
||||||
|
TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \
|
DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH, \
|
||||||
|
TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
|
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \
|
||||||
SRC_DEPTH) \
|
SRC_DEPTH, TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
|
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH, \
|
||||||
|
TILE_WIDTH, TILE_HEIGHT) \
|
||||||
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
|
||||||
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
|
||||||
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH)
|
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH, \
|
||||||
|
TILE_WIDTH, TILE_HEIGHT)
|
||||||
|
|
||||||
TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
|
TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
|
||||||
TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
|
TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 1, 1)
|
||||||
|
TESTBIPLANARTOP(MM21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8, 16, 32)
|
||||||
|
|
||||||
// Provide matrix wrappers for full range bt.709
|
// Provide matrix wrappers for full range bt.709
|
||||||
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
|
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
|
||||||
|
|||||||
101
unit_test/planar_test.cc
Executable file → Normal file
101
unit_test/planar_test.cc
Executable file → Normal file
@ -1523,6 +1523,107 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
|
|||||||
free_aligned_buffer_page_end(dst_opt);
|
free_aligned_buffer_page_end(dst_opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
  // Runs DetileSplitUVPlane once with the CPU flags masked by
  // disable_cpu_flags_ and once with benchmark_cpu_info_, then expects the
  // U and V outputs of both runs to be byte-identical.

  // The source is tiled, so its dimensions are rounded up to a multiple of 16.
  const int tiled_stride = (benchmark_width_ + 15) & ~15;
  const int tiled_rows = (benchmark_height_ + 15) & ~15;
  const int tiled_bytes = tiled_stride * tiled_rows;
  const int u_bytes = benchmark_width_ * benchmark_height_;
  const int v_bytes = u_bytes;
  // Row stride handed to DetileSplitUVPlane for each destination plane.
  const int chroma_stride = (benchmark_width_ + 1) / 2;

  align_buffer_page_end(orig_uv, tiled_bytes);
  align_buffer_page_end(dst_u_c, u_bytes);
  align_buffer_page_end(dst_u_opt, u_bytes);
  align_buffer_page_end(dst_v_c, v_bytes);
  align_buffer_page_end(dst_v_opt, v_bytes);

  // Random input; zeroed outputs so untouched bytes compare equal.
  MemRandomize(orig_uv, tiled_bytes);
  memset(dst_u_c, 0, u_bytes);
  memset(dst_u_opt, 0, u_bytes);
  memset(dst_v_c, 0, v_bytes);
  memset(dst_v_opt, 0, v_bytes);

  // Reference pass: all optimizations disabled.
  MaskCpuFlags(disable_cpu_flags_);
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    DetileSplitUVPlane(orig_uv, tiled_stride, dst_u_c, chroma_stride, dst_v_c,
                       chroma_stride, benchmark_width_, benchmark_height_, 16);
  }

  // Measured pass: optimizations enabled.
  MaskCpuFlags(benchmark_cpu_info_);
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    DetileSplitUVPlane(orig_uv, tiled_stride, dst_u_opt, chroma_stride,
                       dst_v_opt, chroma_stride, benchmark_width_,
                       benchmark_height_, 16);
  }

  for (int i = 0; i < u_bytes; ++i) {
    EXPECT_EQ(dst_u_c[i], dst_u_opt[i]);
  }
  for (int i = 0; i < v_bytes; ++i) {
    EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
  }

  free_aligned_buffer_page_end(orig_uv);
  free_aligned_buffer_page_end(dst_u_c);
  free_aligned_buffer_page_end(dst_u_opt);
  free_aligned_buffer_page_end(dst_v_c);
  free_aligned_buffer_page_end(dst_v_opt);
}
|
||||||
|
|
||||||
|
TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
  // Checks the fused DetileSplitUVPlane() against a two-stage reference
  // (DetilePlane() followed by SplitUVPlane()) on the same random tiled input.

  // orig is tiled. Allocate enough memory for tiles.
  const int orig_width = (benchmark_width_ + 15) & ~15;
  const int orig_height = (benchmark_height_ + 15) & ~15;
  const int orig_plane_size = orig_width * orig_height;
  // One U (or V) sample per interleaved UV pair in a row.
  const int half_width = (benchmark_width_ + 1) / 2;
  // Interleaved bytes per row, rounded up to whole UV pairs so that an odd
  // benchmark_width_ still gives the reference path the trailing V byte.
  // Never exceeds orig_width because orig_width is even and >= the width.
  const int uv_row_bytes = half_width * 2;
  // Larger than the half_width * height actually written; the slack stays
  // zero in every buffer and therefore compares equal.
  const int u_plane_size = benchmark_width_ * benchmark_height_;
  const int v_plane_size = u_plane_size;

  align_buffer_page_end(orig_uv, orig_plane_size);
  align_buffer_page_end(detiled_uv, orig_plane_size);
  align_buffer_page_end(dst_u_two_stage, u_plane_size);
  align_buffer_page_end(dst_u_opt, u_plane_size);
  align_buffer_page_end(dst_v_two_stage, v_plane_size);
  align_buffer_page_end(dst_v_opt, v_plane_size);

  MemRandomize(orig_uv, orig_plane_size);
  memset(detiled_uv, 0, orig_plane_size);
  memset(dst_u_two_stage, 0, u_plane_size);
  memset(dst_u_opt, 0, u_plane_size);
  memset(dst_v_two_stage, 0, v_plane_size);
  memset(dst_v_opt, 0, v_plane_size);

  // Fused path under test.
  for (int j = 0; j < benchmark_iterations_; j++) {
    DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, half_width, dst_v_opt,
                       half_width, benchmark_width_, benchmark_height_, 16);
  }

  // Two-stage reference. The intermediate buffer is written and read with the
  // same row stride (orig_width); previously it was written with stride
  // benchmark_width_ but read with orig_width, misaligning every row whenever
  // the width is not a multiple of 16.
  DetilePlane(orig_uv, orig_width, detiled_uv, orig_width, uv_row_bytes,
              benchmark_height_, 16);
  // SplitUVPlane's width is counted in U/V samples (per libyuv's
  // planar_functions.h), so pass half_width rather than the byte width;
  // passing benchmark_width_ made it read twice the row and overrun the
  // destination stride.
  SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage, half_width,
               dst_v_two_stage, half_width, half_width, benchmark_height_);

  for (int i = 0; i < u_plane_size; ++i) {
    EXPECT_EQ(dst_u_two_stage[i], dst_u_opt[i]);
  }
  for (int i = 0; i < v_plane_size; ++i) {
    EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
  }

  free_aligned_buffer_page_end(orig_uv);
  free_aligned_buffer_page_end(detiled_uv);
  free_aligned_buffer_page_end(dst_u_two_stage);
  free_aligned_buffer_page_end(dst_u_opt);
  free_aligned_buffer_page_end(dst_v_two_stage);
  free_aligned_buffer_page_end(dst_v_opt);
}
|
||||||
|
|
||||||
static int TestMultiply(int width,
|
static int TestMultiply(int width,
|
||||||
int height,
|
int height,
|
||||||
int benchmark_iterations,
|
int benchmark_iterations,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user