Change the rectangle low-level functions to use more conventional row functions, including 'any' variations. Previously the YUV function SetPlane stored 32-bit values. Now a more conventional memset()-style function that stores bytes is used for YUV. On Haswell, rep stosb is used for YUV. The overall benefit of this CL is improved performance for 'any' width, and simpler row assembly instead of full-image assembly. Previously ARGBRect used a low-level function that supported a rectangle in assembly. Now it uses a row function and relies on row coalescing to combine the rows into a single low-level call.

BUG=371
TESTED=untested
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/35689004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1222 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2015-01-12 03:58:24 +00:00
parent 89671c4de1
commit b2a6af1be6
10 changed files with 129 additions and 93 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1220 Version: 1222
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -114,7 +114,8 @@ extern "C" {
#define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3
#define HAS_SETROW_X86 #define HAS_SETROW_X86
#define HAS_ARGBSETROWS_X86 #define HAS_SETROW_ERMS
#define HAS_ARGBSETROW_X86
#define HAS_SPLITUVROW_SSE2 #define HAS_SPLITUVROW_SSE2
#define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOARGBROW_SSSE3
#define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUV422ROW_SSE2
@ -302,7 +303,8 @@ extern "C" {
#define HAS_RGB565TOYROW_NEON #define HAS_RGB565TOYROW_NEON
#define HAS_RGBATOUVROW_NEON #define HAS_RGBATOUVROW_NEON
#define HAS_RGBATOYROW_NEON #define HAS_RGBATOYROW_NEON
// #define HAS_SETROW_NEON #define HAS_SETROW_NEON
#define HAS_ARGBSETROW_NEON
#define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_NEON
#define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUV422ROW_NEON
@ -332,7 +334,6 @@ extern "C" {
#define HAS_SOBELXYROW_NEON #define HAS_SOBELXYROW_NEON
#define HAS_SOBELYROW_NEON #define HAS_SOBELYROW_NEON
#define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON
// #define HAS_ARGBSETROWS_NEON
#define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON
#endif #endif
@ -800,15 +801,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
void SetRow_C(uint8* dst, uint32 v32, int count); void SetRow_C(uint8* dst, uint8 v8, int count);
void SetRow_X86(uint8* dst, uint32 v32, int count); void SetRow_X86(uint8* dst, uint8 v8, int count);
void SetRow_NEON(uint8* dst, uint32 v32, int count); void SetRow_ERMS(uint8* dst, uint8 v8, int count);
void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, void SetRow_NEON(uint8* dst, uint8 v8, int count);
int height); void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
int dst_stride, int height);
void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
int dst_stride, int height); void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
// ARGBShufflers for BGRAToARGB etc. // ARGBShufflers for BGRAToARGB etc.
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1220 #define LIBYUV_VERSION 1222
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -1094,8 +1094,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
int width, int height, int width, int height,
uint32 value) { uint32 value) {
int y; int y;
uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C;
void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
if (height < 0) { if (height < 0) {
height = -height; height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y; dst_y = dst_y + (height - 1) * dst_stride_y;
@ -1108,19 +1107,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y,
dst_stride_y = 0; dst_stride_y = 0;
} }
#if defined(HAS_SETROW_NEON) #if defined(HAS_SETROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { if (TestCpuFlag(kCpuHasNEON)) {
SetRow = SetRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SetRow = SetRow_NEON; SetRow = SetRow_NEON;
} }
}
#endif #endif
#if defined(HAS_SETROW_X86) #if defined(HAS_SETROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { if (TestCpuFlag(kCpuHasX86)) {
SetRow = SetRow_Any_X86;
if (IS_ALIGNED(width, 4)) {
SetRow = SetRow_X86; SetRow = SetRow_X86;
} }
}
#endif
#if defined(HAS_SETROW_ERMS)
if (TestCpuFlag(kCpuHasERMS)) {
SetRow = SetRow_ERMS;
}
#endif #endif
// Set plane // Set plane
for (y = 0; y < height; ++y) { for (y = 0; y < height; ++y) {
SetRow(dst_y, v32, width); SetRow(dst_y, value, width);
dst_y += dst_stride_y; dst_y += dst_stride_y;
} }
} }
@ -1139,7 +1149,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
if (!dst_y || !dst_u || !dst_v || if (!dst_y || !dst_u || !dst_v ||
width <= 0 || height <= 0 || width <= 0 || height == 0 ||
x < 0 || y < 0 || x < 0 || y < 0 ||
value_y < 0 || value_y > 255 || value_y < 0 || value_y > 255 ||
value_u < 0 || value_u > 255 || value_u < 0 || value_u > 255 ||
@ -1159,6 +1169,8 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
int dst_x, int dst_y, int dst_x, int dst_y,
int width, int height, int width, int height,
uint32 value) { uint32 value) {
int y;
void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C;
if (!dst_argb || if (!dst_argb ||
width <= 0 || height == 0 || width <= 0 || height == 0 ||
dst_x < 0 || dst_y < 0) { dst_x < 0 || dst_y < 0) {
@ -1176,19 +1188,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
height = 1; height = 1;
dst_stride_argb = 0; dst_stride_argb = 0;
} }
#if defined(HAS_ARGBSETROWS_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { #if defined(HAS_ARGBSETROW_NEON)
ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); if (TestCpuFlag(kCpuHasNEON)) {
return 0; ARGBSetRow = ARGBSetRow_Any_NEON;
if (IS_ALIGNED(width, 4)) {
ARGBSetRow = ARGBSetRow_NEON;
}
} }
#endif #endif
#if defined(HAS_ARGBSETROWS_X86) #if defined(HAS_ARGBSETROW_X86)
if (TestCpuFlag(kCpuHasX86)) { if (TestCpuFlag(kCpuHasX86)) {
ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); ARGBSetRow = ARGBSetRow_X86;
return 0;
} }
#endif #endif
ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
// Set plane
for (y = 0; y < height; ++y) {
ARGBSetRow(dst_argb, value, width);
dst_argb += dst_stride_argb;
}
return 0; return 0;
} }

View File

@ -681,6 +681,27 @@ MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31)
#endif #endif
#undef MANY #undef MANY
#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \
void NAMEANY(uint8* dst_y, T v8, int width) { \
int n = width & ~MASK; \
int r = width & MASK; \
if (n > 0) { \
SET_SIMD(dst_y, v8, n); \
} \
SET_C(dst_y + n * BPP, v8, r); \
}
#ifdef HAS_SETROW_X86
SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
#endif
#undef SETANY
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -1623,29 +1623,16 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
memcpy(dst, src, count * 2); memcpy(dst, src, count * 2);
} }
void SetRow_C(uint8* dst, uint32 v8, int count) { void SetRow_C(uint8* dst, uint8 v8, int width) {
#ifdef _MSC_VER memset(dst, v8, width);
// VisualC will generate rep stosb.
int x;
for (x = 0; x < count; ++x) {
dst[x] = v8;
}
#else
memset(dst, v8, count);
#endif
} }
void ARGBSetRows_C(uint8* dst, uint32 v32, int width, void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
int dst_stride, int height) { uint32* d = (uint32*)(dst_argb);
int y;
for (y = 0; y < height; ++y) {
uint32* d = (uint32*)(dst);
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
d[x] = v32; d[x] = v32;
} }
dst += dst_stride;
}
} }
// Filter 2 rows of YUY2 UV's (422) into U and V (420). // Filter 2 rows of YUY2 UV's (422) into U and V (420).

View File

@ -846,7 +846,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
} }
// SetRow writes 'count' bytes using an 8 bit value repeated. // SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"vdup.8 q0, %2 \n" // duplicate 16 bytes "vdup.8 q0, %2 \n" // duplicate 16 bytes
"1: \n" "1: \n"

View File

@ -736,7 +736,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
#endif // HAS_COPYROW_NEON #endif // HAS_COPYROW_NEON
// SetRow writes 'count' bytes using an 8 bit value repeated. // SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint32 v8, int count) { void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile ( asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes "dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n" "1: \n"

View File

@ -2893,10 +2893,10 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86 #ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) { void SetRow_X86(uint8* dst, uint8 v8, int width) {
size_t width_tmp = (size_t)(width); size_t width_tmp = (size_t)(width >> 2);
const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes.
asm volatile ( asm volatile (
"shr $0x2,%1 \n"
"rep stosl " MEMSTORESTRING(eax,0) " \n" "rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(dst), // %0 : "+D"(dst), // %0
"+c"(width_tmp) // %1 "+c"(width_tmp) // %1
@ -2904,19 +2904,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) {
: "memory", "cc"); : "memory", "cc");
} }
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
int dst_stride, int height) { size_t width_tmp = (size_t)(width);
for (int y = 0; y < height; ++y) { asm volatile (
"rep stosb " MEMSTORESTRING(al,0) " \n"
: "+D"(dst), // %0
"+c"(width_tmp) // %1
: "a"(v8) // %2
: "memory", "cc");
}
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
size_t width_tmp = (size_t)(width); size_t width_tmp = (size_t)(width);
uint32* d = (uint32*)(dst);
asm volatile ( asm volatile (
"rep stosl " MEMSTORESTRING(eax,0) " \n" "rep stosl " MEMSTORESTRING(eax,0) " \n"
: "+D"(d), // %0 : "+D"(dst_argb), // %0
"+c"(width_tmp) // %1 "+c"(width_tmp) // %1
: "a"(v32) // %2 : "a"(v32) // %2
: "memory", "cc"); : "memory", "cc");
dst += dst_stride;
}
} }
#endif // HAS_SETROW_X86 #endif // HAS_SETROW_X86

View File

@ -2848,13 +2848,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
#ifdef HAS_SETROW_X86 #ifdef HAS_SETROW_X86
// SetRow writes 'count' bytes using a 32 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void SetRow_X86(uint8* dst, uint32 v32, int count) { void SetRow_X86(uint8* dst, uint8 v8, int count) {
__asm { __asm {
movzx eax, byte ptr [esp + 8] // v8
mov edx, 0x01010101 // Duplicate byte to all bytes.
mul edx // overwrites edx with upper part of result.
mov edx, edi mov edx, edi
mov edi, [esp + 4] // dst mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count mov ecx, [esp + 12] // count
shr ecx, 2 shr ecx, 2
rep stosd rep stosd
@ -2863,32 +2866,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) {
} }
} }
// SetRow32 writes 'count' words using a 32 bit value repeated. // Write 'count' bytes using an 8 bit value repeated.
__declspec(naked) __declspec(align(16)) __declspec(naked) __declspec(align(16))
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
int dst_stride, int height) {
__asm { __asm {
push esi mov edx, edi
push edi mov edi, [esp + 4] // dst
push ebp mov eax, [esp + 8] // v8
mov edi, [esp + 12 + 4] // dst mov ecx, [esp + 12] // count
mov eax, [esp + 12 + 8] // v32 rep stosb
mov ebp, [esp + 12 + 12] // width mov edi, edx
mov edx, [esp + 12 + 16] // dst_stride ret
mov esi, [esp + 12 + 20] // height }
lea ecx, [ebp * 4] }
sub edx, ecx // stride - width * 4
convertloop: // Write 'count' 32 bit values.
mov ecx, ebp __declspec(naked) __declspec(align(16))
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
__asm {
mov edx, edi
mov edi, [esp + 4] // dst
mov eax, [esp + 8] // v32
mov ecx, [esp + 12] // count
rep stosd rep stosd
add edi, edx mov edi, edx
sub esi, 1
jg convertloop
pop ebp
pop edi
pop esi
ret ret
} }
} }