diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7ef0000b3..70d789e29 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -293,6 +293,7 @@ extern "C" { #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_DETILEROW_SSE2 +#define HAS_DETILETOYUY2_SSE2 #define HAS_DETILESPLITUVROW_SSSE3 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 @@ -2048,18 +2049,30 @@ void DetileToYUY2_C(const uint8_t* src_y, ptrdiff_t src_uv_tile_stride, uint8_t* dst_yuy2, int width); +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); +void DetileToYUY2_Any_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); void DetileToYUY2_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); void DetileToYUY2_Any_NEON(const uint8_t* src_y, - ptrdiff_t src_y_tile_stride, - const uint8_t* src_uv, - ptrdiff_t src_uv_tile_stride, - uint8_t* dst_yuy2, - int width); + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, diff --git a/source/convert.cc b/source/convert.cc index 37066721e..a740d5ca7 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -262,10 +262,10 @@ int I210ToI420(const uint16_t* src_y, height); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_u, dst_stride_u, src_u, dst_u, 0, 32768, dy, - /*wpp=*/1, scale, kFilterBilinear); + /*bpp=*/1, scale, kFilterBilinear); ScalePlaneVertical_16To8(height, uv_width, uv_height, src_stride_v, 
dst_stride_v, src_v, dst_v, 0, 32768, dy, - /*wpp=*/1, scale, kFilterBilinear); + /*bpp=*/1, scale, kFilterBilinear); } return 0; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index fae8630e9..1de71dbb0 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -915,7 +915,7 @@ int NV21ToNV12(const uint8_t* src_y, // tile width is 16 and assumed. // tile_height is 16 or 32 for MM21. // src_stride_y is bytes per row of source ignoring tiling. e.g. 640 -// TODO(fbarchard): More detile row functions. +// TODO: More detile row functions. LIBYUV_API void DetilePlane(const uint8_t* src_y, @@ -1074,6 +1074,15 @@ void DetileToYUY2(const uint8_t* src_y, } #endif +#if defined(HAS_DETILETOYUY2_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + DetileToYUY2 = DetileToYUY2_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + DetileToYUY2 = DetileToYUY2_SSE2; + } + } +#endif + // Detile plane for (y = 0; y < height; ++y) { DetileToYUY2(src_y, src_y_tile_stride, src_uv, src_uv_tile_stride, @@ -1081,9 +1090,8 @@ void DetileToYUY2(const uint8_t* src_y, dst_yuy2 += dst_stride_yuy2; src_y += 16; - if (y & 0x1) { + if (y & 0x1) src_uv += 16; - } // Advance to next row of tiles. 
if ((y & (tile_height - 1)) == (tile_height - 1)) { diff --git a/source/row_any.cc b/source/row_any.cc index 5270e86cd..bd46ba1b5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2272,6 +2272,10 @@ ANYDETILESPLITUV(DetileSplitUVRow_Any_SSSE3, DetileSplitUVRow_SSSE3, 15) ANYDETILEMERGE(DetileToYUY2_Any_NEON, DetileToYUY2_NEON, 15) #endif +#ifdef HAS_DETILETOYUY2_SSE2 +ANYDETILEMERGE(DetileToYUY2_Any_SSE2, DetileToYUY2_SSE2, 15) +#endif + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 88766785d..8d0f477c5 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4968,6 +4968,39 @@ void DetileRow_SSE2(const uint8_t* src, } #endif // HAS_DETILEROW_SSE2 +#ifdef HAS_DETILETOYUY2_SSE2 +// Each pass reads 16 Y and 8 UV pairs (16 bytes) and writes 32 bytes of YUY2. +void DetileToYUY2_SSE2(const uint8_t* src_y, + ptrdiff_t src_y_tile_stride, + const uint8_t* src_uv, + ptrdiff_t src_uv_tile_stride, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "movdqu (%0),%%xmm0 \n" // Load 16 Y + "sub $0x10,%3 \n" // Subtract 16 from width; jg below tests the result + "lea (%0,%4),%0 \n" // src_y += src_y_tile_stride + "movdqu (%1),%%xmm1 \n" // Load 8 UV pairs (16 bytes) + "lea (%1,%5),%1 \n" // src_uv += src_uv_tile_stride + "movdqu %%xmm0,%%xmm2 \n" // Copy Y so both halves can be interleaved + "punpcklbw %%xmm1,%%xmm0 \n" // Low 8 Y interleaved with low 8 UV bytes + "punpckhbw %%xmm1,%%xmm2 \n" // High 8 Y interleaved with high 8 UV bytes + "movdqu %%xmm0,(%2) \n" // Store first 16 bytes of YUY2 + "movdqu %%xmm2,0x10(%2) \n" // Store second 16 bytes of YUY2 + "lea 0x20(%2),%2 \n" // dst_yuy2 += 32 + "jg 1b \n" // Loop while width > 0 + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_yuy2), // %2 + "+r"(width) // %3 + : "r"(src_y_tile_stride), // %4 + "r"(src_uv_tile_stride) // %5 + : "cc", "memory", "xmm0", "xmm1", "xmm2" // Clobber list + ); +} +#endif // HAS_DETILETOYUY2_SSE2 + #ifdef HAS_DETILESPLITUVROW_SSSE3 // TODO(greenjustin): Look into generating these constants instead of loading // them since this can cause branch mispredicts for fPIC code on 32-bit diff --git a/source/scale.cc b/source/scale.cc index 4980f42d6..e1335f1ee 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -2039,7 +2039,7 @@ void ScalePlane_16(const uint16_t* src, } // Arbitrary scale vertically, but unscaled horizontally. 
ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, - dst_stride, src, dst, 0, y, dy, /*wpp=*/1, filtering); + dst_stride, src, dst, 0, y, dy, /*bpp=*/1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) {