SSE2 MM21->YUY2 conversion

Add SSE2 optimization for MM21ToYUY2 conversion.

Bug: b/238137982
Change-Id: I189f712514308322f651b082b496bce9c015c4ee
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3832525
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Frank Barchard 2022-08-17 11:20:36 -07:00 committed by Frank Barchard
parent 65e7c9d570
commit 3e38ce5058
6 changed files with 74 additions and 16 deletions

View File

@ -293,6 +293,7 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_DETILEROW_SSE2
#define HAS_DETILETOYUY2_SSE2
#define HAS_DETILESPLITUVROW_SSSE3
#define HAS_HALFMERGEUVROW_SSSE3
#define HAS_I210TOAR30ROW_SSSE3
@ -2048,18 +2049,30 @@ void DetileToYUY2_C(const uint8_t* src_y,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
// SSE2 row function that detiles one row of Y and UV into packed YUY2.
// Parameters mirror the other DetileToYUY2_* row functions declared here:
//   src_y / src_y_tile_stride   - Y source pointer and the byte step applied
//                                 to it after every 16 bytes consumed.
//   src_uv / src_uv_tile_stride - UV source pointer and its byte step.
//   dst_yuy2                    - destination row.
//   width                       - row width; this variant is selected by the
//                                 caller only when width is a multiple of 16.
void DetileToYUY2_SSE2(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
// "Any width" companion of DetileToYUY2_SSE2 with the same parameter list.
// The DetileToYUY2 dispatcher installs this variant whenever SSE2 is
// available and switches to DetileToYUY2_SSE2 only if width is a multiple
// of 16.
// NOTE(review): presumably handles the remainder pixels itself - confirm
// against the ANYDETILEMERGE macro that generates it.
void DetileToYUY2_Any_SSE2(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
void DetileToYUY2_NEON(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
void DetileToYUY2_Any_NEON(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width);
void MergeUVRow_C(const uint8_t* src_u,
const uint8_t* src_v,
uint8_t* dst_uv,

View File

@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y,
height);
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u,
dst_stride_u, src_u, dst_u, 0, 32768, dy,
/*wpp=*/1, scale, kFilterBilinear);
/*bpp=*/1, scale, kFilterBilinear);
ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v,
dst_stride_v, src_v, dst_v, 0, 32768, dy,
/*wpp=*/1, scale, kFilterBilinear);
/*bpp=*/1, scale, kFilterBilinear);
}
return 0;
}

View File

@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y,
// Tile width is assumed to be 16.
// tile_height is 16 or 32 for MM21.
// src_stride_y is bytes per row of source ignoring tiling. e.g. 640
// TODO(fbarchard): More detile row functions.
// TODO: More detile row functions.
LIBYUV_API
void DetilePlane(const uint8_t* src_y,
@ -1074,6 +1074,15 @@ void DetileToYUY2(const uint8_t* src_y,
}
#endif
#if defined(HAS_DETILETOYUY2_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
DetileToYUY2 = DetileToYUY2_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
DetileToYUY2 = DetileToYUY2_SSE2;
}
}
#endif
// Detile plane
for (y = 0; y < height; ++y) {
DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride,
@ -1081,9 +1090,8 @@ void DetileToYUY2(const uint8_t* src_y,
dst_yuy2 += dst_stride_yuy2;
src_y += 16;
if (y & 0x1) {
if (y & 0x1)
src_uv += 16;
}
// Advance to next row of tiles.
if ((y & (tile_height - 1)) == (tile_height - 1)) {

View File

@ -2272,6 +2272,10 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15)
ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15)
#endif
#ifdef HAS_DETILETOYUY2_SSE2
// Generates DetileToYUY2_Any_SSE2 from the DetileToYUY2_SSE2 row function.
// NOTE(review): the trailing 15 is presumably the width mask (16-pixel SIMD
// step minus 1), the same value the NEON instantiation uses - confirm against
// the ANYDETILEMERGE definition.
ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15)
#endif  // HAS_DETILETOYUY2_SSE2
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -4968,6 +4968,39 @@ void DetileRow_SSE2(const uint8_t* src,
}
#endif // HAS_DETILEROW_SSE2
#ifdef HAS_DETILETOYUY2_SSE2
// Read 16 Y, 8 UV, and write 8 YUYV.
//
// Each loop iteration loads 16 bytes from src_y and 16 bytes from src_uv,
// byte-interleaves them (Y byte first, then a UV byte) and stores the
// resulting 32 bytes to dst_yuy2.
// NOTE(review): assuming src_uv holds interleaved U,V bytes, the output byte
// order is Y0 U0 Y1 V0 ..., i.e. YUY2 - confirm against the C reference.
//
// After each iteration src_y advances by src_y_tile_stride bytes, src_uv by
// src_uv_tile_stride bytes and dst_yuy2 by 32 bytes; width is reduced by 16.
//
// The loop is bottom-tested, so the body always runs at least once and always
// handles a full 16-byte group. Callers are expected to pass a positive
// width that is a multiple of 16 (the dispatcher only selects this function
// under IS_ALIGNED(width, 16)).
void DetileToYUY2_SSE2(const uint8_t* src_y,
ptrdiff_t src_y_tile_stride,
const uint8_t* src_uv,
ptrdiff_t src_uv_tile_stride,
uint8_t* dst_yuy2,
int width) {
asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n" // Load 16 Y
// Decrement the remaining width here; the flags this sets are consumed by
// the jg at the bottom. None of the lea/movdqu/punpck instructions in
// between modify the flags.
"sub $0x10,%3 \n"
"lea (%0,%4),%0 \n" // src_y += src_y_tile_stride
"movdqu (%1),%%xmm1 \n" // Load 8 UV (16 bytes)
"lea (%1,%5),%1 \n" // src_uv += src_uv_tile_stride
"movdqu %%xmm0,%%xmm2 \n" // Copy Y so both halves can be interleaved
"punpcklbw %%xmm1,%%xmm0 \n" // Interleave low 8 Y bytes with low 8 UV bytes
"punpckhbw %%xmm1,%%xmm2 \n" // Interleave high 8 Y bytes with high 8 UV bytes
"movdqu %%xmm0,(%2) \n" // Store first 16 output bytes
"movdqu %%xmm2,0x10(%2) \n" // Store second 16 output bytes
"lea 0x20(%2),%2 \n" // dst_yuy2 += 32
"jg 1b \n" // Loop while the decremented width is > 0
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_yuy2), // %2
"+r"(width) // %3
: "r"(src_y_tile_stride), // %4
"r"(src_uv_tile_stride) // %5
: "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list
);
}
#endif  // HAS_DETILETOYUY2_SSE2
#ifdef HAS_DETILESPLITUVROW_SSSE3
// TODO(greenjustin): Look into generating these constants instead of loading
// them since this can cause branch mispredicts for fPIC code on 32-bit

View File

@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src,
}
// Arbitrary scale vertically, but unscaled horizontally.
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering);
dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering);
return;
}
if (dst_width <= Abs(src_width) && dst_height <= src_height) {