Add DetilePlane_16 planar function for 10 bit MT2T format

- Neon and SSE2 row functions
- Any variants for widths that are not a multiple of 16

Pixel 2 little core AArch32 build
C
TestDetilePlane_16 (1275 ms)
TestDetilePlane (1203 ms)
Neon
TestDetilePlane_16 (693 ms)
TestDetilePlane (660 ms)

Bug: b/258474032
Change-Id: Idbd09c5e9324e4deef5f1d54090d4b63cc7db812
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4031848
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-11-16 18:02:34 -08:00 committed by libyuv LUCI CQ
parent 6f21862f1b
commit 2d2cee418a
11 changed files with 279 additions and 67 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1848
Version: 1849
License: BSD
License File: LICENSE

View File

@ -85,7 +85,7 @@ void SetPlane(uint8_t* dst_y,
// Convert a plane of tiles of 16 x H to linear.
LIBYUV_API
void DetilePlane(const uint8_t* src_y,
int DetilePlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
@ -93,6 +93,16 @@ void DetilePlane(const uint8_t* src_y,
int height,
int tile_height);
// Convert a plane of 16 bit tiles of 16 x H to linear.
// src_stride_y and dst_stride_y are counted in uint16_t elements, not bytes.
// A negative height inverts the image (destination rows are written
// bottom-up).
// Returns 0 on success, or -1 on invalid arguments (NULL src_y/dst_y,
// width <= 0, height == 0, or tile_height that is not a power of two).
LIBYUV_API
int DetilePlane_16(const uint16_t* src_y,
int src_stride_y,
uint16_t* dst_y,
int dst_stride_y,
int width,
int height,
int tile_height);
// Convert a UV plane of tiles of 16 x H into linear U and V planes.
LIBYUV_API
void DetileSplitUVPlane(const uint8_t* src_uv,
@ -106,6 +116,7 @@ void DetileSplitUVPlane(const uint8_t* src_uv,
int tile_height);
// Convert a Y and UV plane of tiles into interlaced YUY2.
LIBYUV_API
void DetileToYUY2(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@ -382,6 +393,7 @@ int I210Copy(const uint16_t* src_y,
int height);
// Copy NV12. Supports inverting.
LIBYUV_API
int NV12Copy(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@ -394,6 +406,7 @@ int NV12Copy(const uint8_t* src_y,
int height);
// Copy NV21. Supports inverting.
LIBYUV_API
int NV21Copy(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_vu,

View File

@ -290,6 +290,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_DETILEROW_SSE2
#define HAS_DETILEROW_16_SSE2
#define HAS_DETILESPLITUVROW_SSSE3
#define HAS_DETILETOYUY2_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
@ -449,6 +450,7 @@ extern "C" {
#define HAS_BYTETOFLOATROW_NEON
#define HAS_CONVERT16TO8ROW_NEON
#define HAS_COPYROW_NEON
#define HAS_DETILEROW_16_NEON
#define HAS_DETILEROW_NEON
#define HAS_DETILESPLITUVROW_NEON
#define HAS_DETILETOYUY2_NEON
@ -823,6 +825,7 @@ struct YuvConstants {
#endif
#define IS_POWEROFTWO(x) (!((x) & ((x) - 1)))
#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
#define align_buffer_64(var, size) \
@ -2012,7 +2015,6 @@ void DetileRow_C(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
int width);
void DetileRow_NEON(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
@ -2029,6 +2031,26 @@ void DetileRow_Any_SSE2(const uint8_t* src,
ptrdiff_t src_tile_stride,
uint8_t* dst,
int width);
// Detile one row of 16 bit pixels.
// Copies 16 pixels from src, then advances src by src_tile_stride (counted in
// uint16_t elements) to reach the same row of the next tile, until width
// pixels have been written to dst.
// Portable C version; also copies a trailing partial group of (width & 15).
void DetileRow_16_C(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
// NEON version; processes 16 pixels per loop iteration.
void DetileRow_16_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
// NEON wrapper for any width: runs the NEON kernel on the multiple-of-16 part
// and handles the remaining (width & 15) pixels through a temporary buffer.
void DetileRow_16_Any_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
// SSE2 version; processes 16 pixels per loop iteration.
void DetileRow_16_SSE2(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
// SSE2 wrapper for any width: runs the SSE2 kernel on the multiple-of-16 part
// and handles the remaining (width & 15) pixels through a temporary buffer.
void DetileRow_16_Any_SSE2(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width);
void DetileSplitUVRow_C(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,
uint8_t* dst_u,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1848
#define LIBYUV_VERSION 1849
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -385,6 +385,7 @@ int I420ToI400(const uint8_t* src_y,
}
// Copy NV12. Supports inverting.
LIBYUV_API
int NV12Copy(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
@ -418,6 +419,7 @@ int NV12Copy(const uint8_t* src_y,
}
// Copy NV21. Supports inverting.
LIBYUV_API
int NV21Copy(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_vu,
@ -916,9 +918,8 @@ int NV21ToNV12(const uint8_t* src_y,
// tile_height is 16 or 32 for MM21.
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
// TODO: More detile row functions.
LIBYUV_API
void DetilePlane(const uint8_t* src_y,
int DetilePlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
@ -929,13 +930,10 @@ void DetilePlane(const uint8_t* src_y,
int y;
void (*DetileRow)(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst,
int width) = DetileRow_C;
assert(src_stride_y >= 0);
assert(tile_height > 0);
assert(src_stride_y > 0);
if (width <= 0 || height == 0) {
return;
if (!src_y || !dst_y || width <= 0 || height == 0 || !IS_POWEROFTWO(tile_height)) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -970,6 +968,63 @@ void DetilePlane(const uint8_t* src_y,
src_y = src_y - src_tile_stride + src_stride_y * tile_height;
}
}
return 0;
}
// Convert a plane of 16 bit tiles of 16 x H to linear.
// tile width is 16 and assumed.
// tile_height is 16 or 32 for MT2T; it must be a positive power of two.
// src_stride_y and dst_stride_y are counted in uint16_t elements.
// A negative height inverts the image (destination rows are written
// bottom-up).
// Returns 0 on success, or -1 if src_y/dst_y is NULL, width <= 0,
// height == 0, or tile_height is not a positive power of two.
LIBYUV_API
int DetilePlane_16(const uint16_t* src_y,
                   int src_stride_y,
                   uint16_t* dst_y,
                   int dst_stride_y,
                   int width,
                   int height,
                   int tile_height) {
  // Distance, in uint16_t elements, between the same row of adjacent tiles.
  const ptrdiff_t src_tile_stride = (ptrdiff_t)16 * tile_height;
  int y;
  void (*DetileRow_16)(const uint16_t* src, ptrdiff_t src_tile_stride,
                       uint16_t* dst, int width) = DetileRow_16_C;
  // IS_POWEROFTWO(0) evaluates to true, so non-positive tile heights are
  // rejected explicitly before the power-of-two test.
  if (!src_y || !dst_y || width <= 0 || height == 0 || tile_height <= 0 ||
      !IS_POWEROFTWO(tile_height)) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_stride_y = -dst_stride_y;
  }

  // Guard each dispatch with the macro of the 16 bit kernel it selects, not
  // the macro of the 8 bit DetileRow kernel.
#if defined(HAS_DETILEROW_16_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    DetileRow_16 = DetileRow_16_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      DetileRow_16 = DetileRow_16_SSE2;
    }
  }
#endif
#if defined(HAS_DETILEROW_16_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    DetileRow_16 = DetileRow_16_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      DetileRow_16 = DetileRow_16_NEON;
    }
  }
#endif

  // Detile plane
  for (y = 0; y < height; ++y) {
    DetileRow_16(src_y, src_tile_stride, dst_y, width);
    dst_y += dst_stride_y;
    // Next row inside the current tile: rows are 16 pixels wide.
    src_y += 16;
    // Advance to next row of tiles.
    if ((y & (tile_height - 1)) == (tile_height - 1)) {
      src_y = src_y - src_tile_stride + src_stride_y * tile_height;
    }
  }
  return 0;
}
LIBYUV_API

View File

@ -2242,26 +2242,31 @@ ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
#endif
#undef ANY11S
#define ANYDETILE(NAMEANY, ANY_SIMD, MASK) \
void NAMEANY(const uint8_t* src, ptrdiff_t src_tile_stride, uint8_t* dst, \
int width) { \
SIMD_ALIGNED(uint8_t temp[16 * 2]); \
memset(temp, 0, 16); /* for msan */ \
#define ANYDETILE(NAMEANY, ANY_SIMD, T, BPP, MASK) \
void NAMEANY(const T* src, ptrdiff_t src_tile_stride, T* dst, int width) { \
SIMD_ALIGNED(T temp[16 * 2]); \
memset(temp, 0, 16 * BPP); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src, src_tile_stride, dst, n); \
} \
memcpy(temp, src + (n / 16) * src_tile_stride, r); \
memcpy(temp, src + (n / 16) * src_tile_stride, r * BPP); \
ANY_SIMD(temp, src_tile_stride, temp + 16, MASK + 1); \
memcpy(dst + n, temp + 16, r); \
memcpy(dst + n, temp + 16, r * BPP); \
}
#ifdef HAS_DETILEROW_NEON
ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, 15)
ANYDETILE(DetileRow_Any_NEON, DetileRow_NEON, uint8_t, 1, 15)
#endif
#ifdef HAS_DETILEROW_SSE2
ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, 15)
ANYDETILE(DetileRow_Any_SSE2, DetileRow_SSE2, uint8_t, 1, 15)
#endif
#ifdef HAS_DETILEROW_16_NEON
ANYDETILE(DetileRow_16_Any_NEON, DetileRow_16_NEON, uint16_t, 2, 15)
#endif
#ifdef HAS_DETILEROW_16_SSE2
ANYDETILE(DetileRow_16_Any_SSE2, DetileRow_16_SSE2, uint16_t, 2, 15)
#endif
#define ANYDETILESPLITUV(NAMEANY, ANY_SIMD, MASK) \

View File

@ -2748,6 +2748,21 @@ void DetileRow_C(const uint8_t* src,
}
}
// Detile one row of 16 bit pixels.
// Copies groups of 16 pixels from src to dst; after each group src moves
// forward by src_tile_stride (in uint16_t elements) to the same row of the
// next tile. A trailing partial group of (width & 15) pixels is copied last.
void DetileRow_16_C(const uint16_t* src,
                    ptrdiff_t src_tile_stride,
                    uint16_t* dst,
                    int width) {
  const int tail = width & 15;
  int remaining;
  // One full 16 pixel tile row per iteration.
  for (remaining = width; remaining >= 16; remaining -= 16) {
    memcpy(dst, src, 16 * sizeof(uint16_t));
    src += src_tile_stride;
    dst += 16;
  }
  // Leftover pixels from the last, partially used tile.
  if (tail != 0) {
    memcpy(dst, src, (size_t)tail * sizeof(uint16_t));
  }
}
void DetileSplitUVRow_C(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,
uint8_t* dst_u,

View File

@ -5030,6 +5030,29 @@ void DetileRow_SSE2(const uint8_t* src,
}
#endif // HAS_DETILEROW_SSE2
#ifdef HAS_DETILEROW_16_SSE2
// Detile one row of 16 bit pixels with SSE2.
// Each loop iteration copies 16 uint16_t pixels (two 16 byte registers) from
// src to dst, then steps src forward by src_tile_stride elements to the same
// row of the next tile. width counts pixels and drops by 16 per iteration.
// The loop body runs before width is tested, so at least one group of 16
// pixels is always copied.
void DetileRow_16_SSE2(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width) {
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"  // load pixels 0..7
"movdqu 0x10(%0),%%xmm1 \n"  // load pixels 8..15
"lea (%0,%3,2),%0 \n"  // src += src_tile_stride * 2 bytes
"movdqu %%xmm0,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n"  // dst += 32 bytes (16 pixels)
"sub $0x10,%2 \n"  // 16 pixels processed per loop
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(src_tile_stride) // %3
: "cc", "memory", "xmm0", "xmm1");
}
#endif // HAS_DETILEROW_16_SSE2
#ifdef HAS_DETILETOYUY2_SSE2
// Read 16 Y, 8 UV, and write 8 YUYV.
void DetileToYUY2_SSE2(const uint8_t* src_y,

View File

@ -622,6 +622,26 @@ void DetileRow_NEON(const uint8_t* src,
);
}
// Reads 16 Y's of 16 bits each from a tile and writes them out linearly.
// After every group of 16 pixels src is post-incremented by
// src_tile_stride * 2 bytes, i.e. src_tile_stride uint16_t elements.
// The loop body runs before width is tested, so at least 16 pixels are
// always copied.
void DetileRow_16_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width) {
asm volatile(
"1: \n"
"vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"pld [%0, #3584] \n"  // prefetch 3584 bytes ahead of src
"vst1.16 {q0, q1}, [%1]! \n" // store 16 pixels
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(src_tile_stride * 2) // %3
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,

View File

@ -650,6 +650,26 @@ void DetileRow_NEON(const uint8_t* src,
);
}
// Reads 16 Y's of 16 bits each from a tile and writes them out linearly.
// After every group of 16 pixels src is post-incremented by
// src_tile_stride * 2 bytes, i.e. src_tile_stride uint16_t elements.
// The loop body runs before width is tested, so at least 16 pixels are
// always copied.
void DetileRow_16_NEON(const uint16_t* src,
ptrdiff_t src_tile_stride,
uint16_t* dst,
int width) {
asm volatile(
"1: \n"
"ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
"st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"(src_tile_stride * 2) // %3
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
ptrdiff_t src_tile_stride,

View File

@ -1638,29 +1638,29 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
int i, j;
// orig is tiled. Allocate enough memory for tiles.
int orig_width = (benchmark_width_ + 15) & ~15;
int orig_height = (benchmark_height_ + 15) & ~15;
int orig_plane_size = orig_width * orig_height;
int tile_width = (benchmark_width_ + 15) & ~15;
int tile_height = (benchmark_height_ + 15) & ~15;
int tile_plane_size = tile_width * tile_height;
int y_plane_size = benchmark_width_ * benchmark_height_;
align_buffer_page_end(orig_y, orig_plane_size);
align_buffer_page_end(tile_y, tile_plane_size);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
MemRandomize(orig_y, orig_plane_size);
MemRandomize(tile_y, tile_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 0, y_plane_size);
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_,
DetilePlane(tile_y, tile_width, dst_c, benchmark_width_, benchmark_width_,
benchmark_height_, 16);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_,
DetilePlane(tile_y, tile_width, dst_opt, benchmark_width_, benchmark_width_,
benchmark_height_, 16);
}
@ -1668,7 +1668,46 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
EXPECT_EQ(dst_c[i], dst_opt[i]);
}
free_aligned_buffer_page_end(orig_y);
free_aligned_buffer_page_end(tile_y);
free_aligned_buffer_page_end(dst_c);
free_aligned_buffer_page_end(dst_opt);
}
// Runs DetilePlane_16 on random tiled input once with the CPU flags masked by
// disable_cpu_flags_ and once with benchmark_cpu_info_, then expects both
// outputs to match. Each call is repeated benchmark_iterations_ times.
TEST_F(LibYUVPlanarTest, TestDetilePlane_16) {
int i, j;
// tile_y is tiled. Allocate enough memory for tiles.
// Dimensions are rounded up to a multiple of 16; sizes are scaled by 2 for
// the 16 bit samples.
int tile_width = (benchmark_width_ + 15) & ~15;
int tile_height = (benchmark_height_ + 15) & ~15;
int tile_plane_size = tile_width * tile_height * 2;
int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
align_buffer_page_end(tile_y, tile_plane_size);
align_buffer_page_end(dst_c, y_plane_size);
align_buffer_page_end(dst_opt, y_plane_size);
MemRandomize(tile_y, tile_plane_size);
memset(dst_c, 0, y_plane_size);
memset(dst_opt, 0, y_plane_size);
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_c,
benchmark_width_, benchmark_width_, benchmark_height_, 16);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane_16((const uint16_t*)tile_y, tile_width, (uint16_t*)dst_opt,
benchmark_width_, benchmark_width_, benchmark_height_, 16);
}
// Compare the two outputs over the full destination plane size.
for (i = 0; i < y_plane_size; ++i) {
EXPECT_EQ(dst_c[i], dst_opt[i]);
}
free_aligned_buffer_page_end(tile_y);
free_aligned_buffer_page_end(dst_c);
free_aligned_buffer_page_end(dst_opt);
}
@ -1678,33 +1717,33 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
int i, j;
// orig is tiled. Allocate enough memory for tiles.
int orig_width = (benchmark_width_ + 15) & ~15;
int orig_height = (benchmark_height_ + 15) & ~15;
int orig_plane_size = orig_width * orig_height;
int tile_width = (benchmark_width_ + 15) & ~15;
int tile_height = (benchmark_height_ + 15) & ~15;
int tile_plane_size = tile_width * tile_height;
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
align_buffer_page_end(orig_uv, orig_plane_size);
align_buffer_page_end(detiled_uv, orig_plane_size);
align_buffer_page_end(tile_uv, tile_plane_size);
align_buffer_page_end(detiled_uv, tile_plane_size);
align_buffer_page_end(dst_u_two_stage, uv_plane_size);
align_buffer_page_end(dst_u_opt, uv_plane_size);
align_buffer_page_end(dst_v_two_stage, uv_plane_size);
align_buffer_page_end(dst_v_opt, uv_plane_size);
MemRandomize(orig_uv, orig_plane_size);
memset(detiled_uv, 0, orig_plane_size);
MemRandomize(tile_uv, tile_plane_size);
memset(detiled_uv, 0, tile_plane_size);
memset(dst_u_two_stage, 0, uv_plane_size);
memset(dst_u_opt, 0, uv_plane_size);
memset(dst_v_two_stage, 0, uv_plane_size);
memset(dst_v_opt, 0, uv_plane_size);
DetileSplitUVPlane(orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2,
DetileSplitUVPlane(tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2,
dst_v_opt, (benchmark_width_ + 1) / 2, benchmark_width_,
benchmark_height_, 16);
// Benchmark 2 step conversion for comparison.
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_uv, orig_width, detiled_uv, benchmark_width_,
DetilePlane(tile_uv, tile_width, detiled_uv, benchmark_width_,
benchmark_width_, benchmark_height_, 16);
SplitUVPlane(detiled_uv, orig_width, dst_u_two_stage,
SplitUVPlane(detiled_uv, tile_width, dst_u_two_stage,
(benchmark_width_ + 1) / 2, dst_v_two_stage,
(benchmark_width_ + 1) / 2, (benchmark_width_ + 1) / 2,
benchmark_height_);
@ -1715,7 +1754,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Correctness) {
EXPECT_EQ(dst_v_two_stage[i], dst_v_opt[i]);
}
free_aligned_buffer_page_end(orig_uv);
free_aligned_buffer_page_end(tile_uv);
free_aligned_buffer_page_end(detiled_uv);
free_aligned_buffer_page_end(dst_u_two_stage);
free_aligned_buffer_page_end(dst_u_opt);
@ -1727,17 +1766,17 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
int i, j;
// orig is tiled. Allocate enough memory for tiles.
int orig_width = (benchmark_width_ + 15) & ~15;
int orig_height = (benchmark_height_ + 15) & ~15;
int orig_plane_size = orig_width * orig_height;
int tile_width = (benchmark_width_ + 15) & ~15;
int tile_height = (benchmark_height_ + 15) & ~15;
int tile_plane_size = tile_width * tile_height;
int uv_plane_size = ((benchmark_width_ + 1) / 2) * benchmark_height_;
align_buffer_page_end(orig_uv, orig_plane_size);
align_buffer_page_end(tile_uv, tile_plane_size);
align_buffer_page_end(dst_u_c, uv_plane_size);
align_buffer_page_end(dst_u_opt, uv_plane_size);
align_buffer_page_end(dst_v_c, uv_plane_size);
align_buffer_page_end(dst_v_opt, uv_plane_size);
MemRandomize(orig_uv, orig_plane_size);
MemRandomize(tile_uv, tile_plane_size);
memset(dst_u_c, 0, uv_plane_size);
memset(dst_u_opt, 0, uv_plane_size);
memset(dst_v_c, 0, uv_plane_size);
@ -1746,7 +1785,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
DetileSplitUVPlane(orig_uv, orig_width, dst_u_c, (benchmark_width_ + 1) / 2,
DetileSplitUVPlane(tile_uv, tile_width, dst_u_c, (benchmark_width_ + 1) / 2,
dst_v_c, (benchmark_width_ + 1) / 2, benchmark_width_,
benchmark_height_, 16);
@ -1755,7 +1794,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
for (j = 0; j < benchmark_iterations_; j++) {
DetileSplitUVPlane(
orig_uv, orig_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
tile_uv, tile_width, dst_u_opt, (benchmark_width_ + 1) / 2, dst_v_opt,
(benchmark_width_ + 1) / 2, benchmark_width_, benchmark_height_, 16);
}
@ -1764,7 +1803,7 @@ TEST_F(LibYUVPlanarTest, TestDetileSplitUVPlane_Benchmark) {
EXPECT_EQ(dst_v_c[i], dst_v_opt[i]);
}
free_aligned_buffer_page_end(orig_uv);
free_aligned_buffer_page_end(tile_uv);
free_aligned_buffer_page_end(dst_u_c);
free_aligned_buffer_page_end(dst_u_opt);
free_aligned_buffer_page_end(dst_v_c);