mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
SSE2 MM21->YUY2 conversion
Add SSE2 optimization for MM21ToYUY2 conversion. Bug: b/238137982 Change-Id: I189f712514308322f651b082b496bce9c015c4ee Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3832525 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
65e7c9d570
commit
3e38ce5058
@ -293,6 +293,7 @@ extern "C" {
|
||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||
#define HAS_CONVERT8TO16ROW_SSE2
|
||||
#define HAS_DETILEROW_SSE2
|
||||
#define HAS_DETILETOYUY2_SSE2
|
||||
#define HAS_DETILESPLITUVROW_SSSE3
|
||||
#define HAS_HALFMERGEUVROW_SSSE3
|
||||
#define HAS_I210TOAR30ROW_SSSE3
|
||||
@ -2048,18 +2049,30 @@ void DetileToYUY2_C(const uint8_t* src_y,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
void DetileToYUY2_SSE2(const uint8_t* src_y,
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
void DetileToYUY2_Any_SSE2(const uint8_t* src_y,
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
void DetileToYUY2_NEON(const uint8_t* src_y,
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
void DetileToYUY2_Any_NEON(const uint8_t* src_y,
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width);
|
||||
void MergeUVRow_C(const uint8_t* src_u,
|
||||
const uint8_t* src_v,
|
||||
uint8_t* dst_uv,
|
||||
|
||||
@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y,
|
||||
height);
|
||||
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
|
||||
dst_stride_u, src_u, dst_u, 0, 32768, dy,
|
||||
/*wpp=*/1, scale, kFilterBilinear);
|
||||
/*bpp=*/1, scale, kFilterBilinear);
|
||||
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
|
||||
dst_stride_v, src_v, dst_v, 0, 32768, dy,
|
||||
/*wpp=*/1, scale, kFilterBilinear);
|
||||
/*bpp=*/1, scale, kFilterBilinear);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y,
|
||||
// tile width is 16 and assumed.
|
||||
// tile_height is 16 or 32 for MM21.
|
||||
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
|
||||
// TODO(fbarchard): More detile row functions.
|
||||
// TODO: More detile row functions.
|
||||
|
||||
LIBYUV_API
|
||||
void DetilePlane(const uint8_t* src_y,
|
||||
@ -1074,6 +1074,15 @@ void DetileToYUY2(const uint8_t* src_y,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_DETILETOYUY2_SSE2)
|
||||
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||
DetileToYUY2 = DetileToYUY2_Any_SSE2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
DetileToYUY2 = DetileToYUY2_SSE2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Detile plane
|
||||
for (y = 0; y < height; ++y) {
|
||||
DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride,
|
||||
@ -1081,9 +1090,8 @@ void DetileToYUY2(const uint8_t* src_y,
|
||||
dst_yuy2 += dst_stride_yuy2;
|
||||
src_y += 16;
|
||||
|
||||
if (y & 0x1) {
|
||||
if (y & 0x1)
|
||||
src_uv += 16;
|
||||
}
|
||||
|
||||
// Advance to next row of tiles.
|
||||
if ((y & (tile_height - 1)) == (tile_height - 1)) {
|
||||
|
||||
@ -2272,6 +2272,10 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
|
||||
ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
|
||||
#endif
|
||||
|
||||
#ifdef HAS_DETILETOYUY2_SSE2
|
||||
ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -4968,6 +4968,39 @@ void DetileRow_SSE2(const uint8_t* src,
|
||||
}
|
||||
#endif // HAS_DETILEROW_SSE2
|
||||
|
||||
#ifdef HAS_DETILETOYUY2_SSE2
|
||||
// Read 16 Y, 8 UV, and write 8 YUYV.
// Interleaves Y bytes from src_y with bytes from src_uv (Y0 UV0 Y1 UV1 ...).
// Each loop iteration loads 16 bytes from src_y and 16 bytes from src_uv
// (8 UV pairs), stores 32 bytes to dst_yuy2, then advances src_y by
// src_y_tile_stride bytes, src_uv by src_uv_tile_stride bytes and dst_yuy2
// by 32 bytes.
// width is decremented by 16 per iteration and tested at the bottom of the
// loop, so the body always runs at least once and a width that is not a
// multiple of 16 is processed as if rounded up to the next multiple of 16.
|
||||
void DetileToYUY2_SSE2(const uint8_t* src_y,
|
||||
ptrdiff_t src_y_tile_stride,
|
||||
const uint8_t* src_uv,
|
||||
ptrdiff_t src_uv_tile_stride,
|
||||
uint8_t* dst_yuy2,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n" // Load 16 Y
|
||||
"sub $0x10,%3 \n" // width -= 16; sets the flags tested by jg below
|
||||
|
||||
"lea (%0,%4),%0 \n" // src_y += src_y_tile_stride (lea leaves flags intact)
|
||||
"movdqu (%1),%%xmm1 \n" // Load 8 UV
|
||||
"lea (%1,%5),%1 \n" // src_uv += src_uv_tile_stride
|
||||
"movdqu %%xmm0,%%xmm2 \n" // Copy Y so low and high halves can be unpacked
|
||||
"punpcklbw %%xmm1,%%xmm0 \n" // Interleave low 8 Y bytes with low 8 UV bytes
|
||||
"punpckhbw %%xmm1,%%xmm2 \n" // Interleave high 8 Y bytes with high 8 UV bytes
|
||||
"movdqu %%xmm0,(%2) \n" // Store first 16 output bytes
|
||||
"movdqu %%xmm2,0x10(%2) \n" // Store second 16 output bytes
|
||||
"lea 0x20(%2),%2 \n" // dst_yuy2 += 32
|
||||
"jg 1b \n" // Loop while width > 0 (flags from the sub above)
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_uv), // %1
|
||||
"+r"(dst_yuy2), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(src_y_tile_stride), // %4
|
||||
"r"(src_uv_tile_stride) // %5
|
||||
: "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_DETILESPLITUVROW_SSSE3
|
||||
// TODO(greenjustin): Look into generating these constants instead of loading
|
||||
// them since this can cause branch mispredicts for fPIC code on 32-bit
|
||||
|
||||
@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src,
|
||||
}
|
||||
// Arbitrary scale vertically, but unscaled horizontally.
|
||||
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
|
||||
dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering);
|
||||
dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
|
||||
return;
|
||||
}
|
||||
if (dst_width <= Abs(src_width) && dst_height <= src_height) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user