From 6a192487fe9bd15a56b42b1f6adb512f58bc4009 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 20 Feb 2015 22:46:15 +0000 Subject: [PATCH] Switch SSSE3 row wrappers from variable sized malloc to fixed size array with loop to process a portion of the row at a time. This helps performance in the case where the image has been coalesced into a single large row and the allocator, although only called once, is slow to clear the pages. Also the smaller temporary buffer fits cache, further improving performance. BUG=403 TESTED=YUY2ToARGB unittest R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/40849004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1286 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_common.cc | 148 ++++++++++++++++++++++++--------------- 3 files changed, 94 insertions(+), 58 deletions(-) diff --git a/README.chromium b/README.chromium index d7e620034..cf14a1482 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1285 +Version: 1286 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 0334e9b13..ff43a1351 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1285 +#define LIBYUV_VERSION 1286 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_common.cc b/source/row_common.cc index 53f41ae6c..42ab76b76 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2122,6 +2122,9 @@ void I422ToUYVYRow_C(const uint8* src_y, } } +// Maximum temporary width for wrappers to process at a time, in pixels. +#define MAXTWIDTH 4096 + #if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. 
#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) @@ -2130,11 +2133,17 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* rgb_buf, int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToRGB565Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToRGB565Row_SSE2(row, rgb_buf, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + rgb_buf += twidth * 2; + width -= twidth; + } } #endif // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__)) @@ -2144,11 +2153,18 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* rgb_buf, int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToARGB1555Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB1555Row_SSE2(row, rgb_buf, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + rgb_buf += twidth * 2; + width -= twidth; + } } void I422ToARGB4444Row_SSSE3(const uint8* src_y, @@ -2156,61 +2172,81 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* rgb_buf, int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width); - ARGBToARGB4444Row_SSE2(row, rgb_buf, width); - free_aligned_buffer_64(row); + // Row buffer for intermediate ARGB pixels.
+ SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, twidth); + ARGBToARGB4444Row_SSE2(row, rgb_buf, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + rgb_buf += twidth * 2; + width -= twidth; + } } -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - NV12ToARGBRow_SSSE3(src_y, src_uv, row, width); - ARGBToRGB565Row_SSE2(row, dst_rgb565, width); - free_aligned_buffer_64(row); +void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } } -void NV21ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_rgb565, - int width) { - // Allocate a row of ARGB. - align_buffer_64(row, width * 4); - NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); - ARGBToRGB565Row_SSE2(row, dst_rgb565, width); - free_aligned_buffer_64(row); +void NV21ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_vu, + uint8* dst_rgb565, int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, twidth); + ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb565 += twidth * 2; + width -= twidth; + } } -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. 
- align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width); - YUY2ToYRow_SSE2(src_yuy2, row_y, width); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); + YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_yuy2 += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } } -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - int width) { - // Allocate a rows of yuv. - align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; - UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width); - UYVYToYRow_SSE2(src_uyvy, row_y, width); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width); - free_aligned_buffer_64(row_y); +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { + // Row buffers for intermediate YUV pixels. + SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); + SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); + SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); + UYVYToYRow_SSE2(src_uyvy, row_y, twidth); + I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, twidth); + src_uyvy += twidth * 2; + dst_argb += twidth * 4; + width -= twidth; + } } - #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(LIBYUV_DISABLE_X86)