From 810cd91079505f04cfec7481b51d04f08250d982 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 20 Apr 2012 20:15:27 +0000 Subject: [PATCH] ARGBUnattenuateRow_SSE2 use reciprocal table and pmul BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/497001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@244 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- source/planar_functions.cc | 76 +---------------- source/row.h | 14 ++-- source/row_common.cc | 73 +++++++++++++++++ source/row_posix.cc | 162 +++++++++++++++++++++++++++++++++++-- source/row_win.cc | 107 ++++++++++++++++++++++-- unit_test/compare_test.cc | 66 +++++++-------- unit_test/cpu_test.cc | 3 +- unit_test/planar_test.cc | 104 +++++++++++++++--------- unit_test/rotate_test.cc | 49 ++++++----- unit_test/scale_test.cc | 3 +- unit_test/unit_test.cc | 14 ++-- unit_test/unit_test.h | 42 +++++----- 12 files changed, 488 insertions(+), 225 deletions(-) diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 2f005736f..0e0b2428b 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -10,6 +10,7 @@ #include "libyuv/planar_functions.h" +#include // printf() #include // for memset() #include "libyuv/cpu_id.h" @@ -909,80 +910,6 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, return 0; } -// Divide source RGB by alpha and store to destination. -// b = (b * 255 + (a / 2)) / a; -// g = (g * 255 + (a / 2)) / a; -// r = (r * 255 + (a / 2)) / a; -// Reciprocal method is off by 1 on some values. ie 125 -// 8.16 fixed point inverse table -#define T(a) 0x1000000 / a -static uint32 fixed_invtbl[256] = { - 0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), - T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), - T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), - T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), - T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), - T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), - T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), - T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), - T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), - T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), - T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), - T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), - T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), - T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), - T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), - T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), - T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), - T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), - T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), - T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), - T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), - T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), - T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), 
T(0xce), T(0xcf), - T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), - T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), - T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), - T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), - T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) }; -#undef T - -static void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, - int width) { - for (int i = 0; i < width; ++i) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; - if (a) { - const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point - b = (b * ia + 0x8000) >> 16; - g = (g * ia + 0x8000) >> 16; - r = (r * ia + 0x8000) >> 16; - // Clamping should not be necessary but is free in assembly. - if (b > 255) { - b = 255; - } - if (g > 255) { - g = 255; - } - if (r > 255) { - r = 255; - } - } - dst_argb[0] = b; - dst_argb[1] = g; - dst_argb[2] = r; - dst_argb[3] = a; - src_argb += 4; - dst_argb += 4; - } -} - // Convert unattentuated ARGB values to preattenuated ARGB. int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, @@ -1010,7 +937,6 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, return 0; } - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row.h b/source/row.h index fb9040ec2..504175529 100644 --- a/source/row.h +++ b/source/row.h @@ -67,11 +67,8 @@ extern "C" { #define HAS_ARGBBLENDROW_SSE2 #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBATTENUATE_SSE2 -#endif - -// The following are available on Windows 32 bit -#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_ARGBATTENUATE_SSSE3 +#define HAS_ARGBUNATTENUATE_SSE2 #endif // The following are available on Neon platforms @@ -312,11 +309,11 @@ void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); + uint8* dst_u, uint8* dst_v, int width); void I420ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, @@ -370,6 +367,9 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 3df51bfd1..6ebb48e52 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -700,6 +700,79 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } +// Divide source RGB by alpha and store to destination. 
+// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.16 fixed point inverse table +#define T(a) 0x1000000 / a +static uint32 fixed_invtbl[256] = { + 0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) }; +#undef T + +void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { + for (int i = 0; i < width; ++i) { + uint32 b = src_argb[0]; + uint32 g = src_argb[1]; + uint32 r = src_argb[2]; + const uint32 a = src_argb[3]; + if (a) { + const uint32 ia = fixed_invtbl[a]; // 8.16 fixed point + b = (b * ia + 0x8000) >> 16; + g = (g * ia + 0x8000) >> 16; + r = (r * ia + 0x8000) >> 16; + // Clamping should not be necessary but is free in assembly. 
+ if (b > 255) { + b = 255; + } + if (g > 255) { + g = 255; + } + if (r > 255) { + r = 255; + } + } + dst_argb[0] = b; + dst_argb[1] = g; + dst_argb[2] = r; + dst_argb[3] = a; + src_argb += 4; + dst_argb += 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_posix.cc b/source/row_posix.cc index 33828b65e..8d25df0d6 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1730,6 +1730,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { asm volatile ( "sub %0,%1 \n" + ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -2192,9 +2193,9 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "movdqu 0x10(%0),%%xmm3 \n" "lea 0x20(%0),%0 \n" "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" + "paddusb %%xmm2,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" + "paddusb %%xmm1,%%xmm0 \n" "sub $0x4,%3 \n" "movdqa %%xmm0,(%2) \n" "jle 9f \n" @@ -2242,6 +2243,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pslld $0x18,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x8,%%xmm5 \n" + // 4 pixel loop "1: \n" "movdqa (%0),%%xmm0 \n" @@ -2254,13 +2256,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { "pshufhw $0xff,%%xmm1,%%xmm2 \n" "pshuflw $0xff,%%xmm2,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" - "movdqa (%0),%%xmm3 \n" + "movdqa (%0),%%xmm2 \n" "psrlw $0x8,%%xmm0 \n" - "pand %%xmm4,%%xmm3 \n" + "pand %%xmm4,%%xmm2 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "pand %%xmm5,%%xmm0 \n" - "por %%xmm3,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" "sub $0x4,%2 \n" "movdqa %%xmm0,(%0,%1,1) \n" "lea 0x10(%0),%0 \n" @@ -2277,6 +2279,156 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { } #endif // HAS_ARGBATTENUATE_SSE2 +#ifdef HAS_ARGBATTENUATE_SSSE3 +// Shuffle table duplicating alpha +CONST uvec8 kShuffleAlpha0 = { + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, +}; +CONST uvec8 kShuffleAlpha1 = { + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, +}; +// Attenuate 4 pixels at a time. +// aligned to 16 bytes +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop + "1: \n" + "movdqa (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBATTENUATE_SSSE3 + +#ifdef HAS_ARGBUNATTENUATE_SSE2 +// Divide source RGB by alpha and store to destination. 
+// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. ie 125 +// 8.16 fixed point inverse table +#define T(a) 0x10000 / a +CONST uint32 fixed_invtbl8[256] = { + 0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 }; +#undef T + +// Unattenuate 4 pixels at a time. 
+// aligned to 16 bytes +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + uintptr_t alpha = 0; + asm volatile ( + "sub %0,%1 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + // 4 pixel loop + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movzb 0x3(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x0(%4,%3,4),%%xmm2 \n" + "movzb 0x7(%0),%3 \n" + "movd 0x0(%4,%3,4),%%xmm3 \n" + "pshuflw $0xc0,%%xmm2,%%xmm2 \n" + "pshuflw $0xc0,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqa (%0),%%xmm1 \n" + "movzb 0xb(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x0(%4,%3,4),%%xmm2 \n" + "movzb 0xf(%0),%3 \n" + "movd 0x0(%4,%3,4),%%xmm3 \n" + "pshuflw $0xc0,%%xmm2,%%xmm2 \n" + "pshuflw $0xc0,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqa (%0),%%xmm2 \n" + "pand %%xmm4,%%xmm2 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "+r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} +#endif // HAS_ARGBUNATTENUATE_SSE2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index c0519f295..a140fce5f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2319,13 +2319,13 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { pshufhw xmm2, xmm1,0FFh // 8 alpha words pshuflw xmm2, xmm2,0FFh pmulhuw xmm1, xmm2 // rgb * a - movdqa xmm3, [eax] // alphas + movdqa xmm2, [eax] // alphas psrlw xmm0, 8 - pand xmm3, xmm4 + pand xmm2, xmm4 psrlw xmm1, 8 packuswb xmm0, xmm1 pand xmm0, xmm5 // keep original alphas - por xmm0, xmm3 + por xmm0, xmm2 sub ecx, 4 movdqa [eax + edx], xmm0 lea eax, [eax + 16] @@ -2347,7 +2347,6 @@ static const uvec8 kShuffleAlpha1 = { }; __declspec(naked) __declspec(align(16)) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { mov eax, [esp + 4] // src_argb0 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width @@ -2360,7 +2359,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { align 16 convertloop: movdqa xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas + pshufb xmm0, xmm4 // isolate first 2 alphas movdqa xmm1, [eax] // read 4 pixels punpcklbw xmm1, xmm1 // first 2 pixel rgbs pmulhuw xmm0, xmm1 // rgb * a @@ -2383,9 +2382,105 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { ret } } - #endif // HAS_ARGBATTENUATE_SSSE3 +#ifdef HAS_ARGBUNATTENUATE_SSE2 +// Divide source RGB by alpha and store to destination. +// b = (b * 255 + (a / 2)) / a; +// g = (g * 255 + (a / 2)) / a; +// r = (r * 255 + (a / 2)) / a; +// Reciprocal method is off by 1 on some values. 
ie 125 +// 8.16 fixed point inverse table +#define T(a) 0x10000 / a +static uint32 fixed_invtbl8[256] = { + 0x100, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), + T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), + T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), + T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), + T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), + T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), + T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), + T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), + T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), + T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), + T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), + T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), + T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), + T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), + T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), + T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), + T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), + T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), + T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), + T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), + T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), + T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), + T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), + T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), + T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), + T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), + T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x100 }; +#undef T + +// Unattenuate 4 pixels at a time. 
+// aligned to 16 bytes +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb0 + mov edx, [esp + 8 + 8] // dst_argb + mov ecx, [esp + 8 + 12] // width + sub edx, eax + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pslld xmm4, 24 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 3] // first alpha + movzx edi, byte ptr [eax + 7] // second alpha + punpcklbw xmm0, xmm0 // first 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words + pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm0, xmm2 // rgb * a + + movdqa xmm1, [eax] // read 4 pixels + movzx esi, byte ptr [eax + 11] // third alpha + movzx edi, byte ptr [eax + 15] // forth alpha + punpckhbw xmm1, xmm1 // next 2 + movd xmm2, dword ptr fixed_invtbl8[esi * 4] + movd xmm3, dword ptr fixed_invtbl8[edi * 4] + pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words + pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words + movlhps xmm2, xmm3 + pmulhuw xmm1, xmm2 // rgb * a + + movdqa xmm2, [eax] // alphas + pand xmm2, xmm4 + packuswb xmm0, xmm1 + por xmm0, xmm2 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + pop edi + pop esi + ret + } +} +#endif // HAS_ARGBUNATTENUATE_SSE2 + #endif // _M_IX86 #ifdef __cplusplus diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index d5400b07e..18ff2185f 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -63,7 +63,7 @@ TEST_F(libyuvTest, BenchmakDjb2_C) { uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h1; MaskCpuFlags(kCpuInitialized); - for (int i = 0; i < _benchmark_iterations; ++i) { + for (int i = 0; i < benchmark_iterations_; ++i) { h1 = HashDjb2(src_a, kMaxTest, 5381); } MaskCpuFlags(-1); @@ -80,7 +80,7 @@ TEST_F(libyuvTest, BenchmakDjb2_OPT) { } uint32 h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32 h1; - for (int i = 0; i < _benchmark_iterations; ++i) { + for (int i = 0; i < benchmark_iterations_; ++i) { h1 = HashDjb2(src_a, kMaxTest, 5381); } EXPECT_EQ(h1, h2); @@ -96,7 +96,7 @@ TEST_F(libyuvTest, BenchmakDjb2_Unaligned_OPT) { } uint32 h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); uint32 h1; - for (int i = 0; i < _benchmark_iterations; ++i) { + for (int i = 0; i < benchmark_iterations_; ++i) { h1 = HashDjb2(src_a + 1, kMaxTest, 5381); } EXPECT_EQ(h1, h2); @@ -110,7 +110,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_C) { align_buffer_16(src_b, max_width) MaskCpuFlags(kCpuInitialized); - for (int i = 0; i < _benchmark_iterations; ++i) { + for (int i = 0; i < benchmark_iterations_; ++i) { ComputeSumSquareError(src_a, src_b, max_width); } @@ -128,7 +128,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_OPT) { align_buffer_16(src_a, max_width) align_buffer_16(src_b, max_width) - for (int i = 0; i < _benchmark_iterations; ++i) { + for (int i = 0; i < benchmark_iterations_; ++i) { ComputeSumSquareError(src_a, src_b, max_width); } @@ -183,18 +183,18 @@ TEST_F(libyuvTest, SumSquareError) { } TEST_F(libyuvTest, BenchmarkPsnr_C) { - align_buffer_16(src_a, _benchmark_width * _benchmark_height) - align_buffer_16(src_b, _benchmark_width * _benchmark_height) + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) MaskCpuFlags(kCpuInitialized); double 
c_time = get_time(); - for (int i = 0; i < _benchmark_iterations; ++i) - CalcFramePsnr(src_a, _benchmark_width, - src_b, _benchmark_width, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFramePsnr(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); - c_time = (get_time() - c_time) / _benchmark_iterations; + c_time = (get_time() - c_time) / benchmark_iterations_; printf("BenchmarkPsnr_C - %8.2f us c\n", c_time * 1e6); MaskCpuFlags(-1); @@ -206,18 +206,18 @@ TEST_F(libyuvTest, BenchmarkPsnr_C) { } TEST_F(libyuvTest, BenchmarkPsnr_OPT) { - align_buffer_16(src_a, _benchmark_width * _benchmark_height) - align_buffer_16(src_b, _benchmark_width * _benchmark_height) + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) MaskCpuFlags(-1); double opt_time = get_time(); - for (int i = 0; i < _benchmark_iterations; ++i) - CalcFramePsnr(src_a, _benchmark_width, - src_b, _benchmark_width, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFramePsnr(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); - opt_time = (get_time() - opt_time) / _benchmark_iterations; + opt_time = (get_time() - opt_time) / benchmark_iterations_; printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); EXPECT_EQ(0, 0); @@ -304,18 +304,18 @@ TEST_F(libyuvTest, Psnr) { } TEST_F(libyuvTest, BenchmarkSsim_C) { - align_buffer_16(src_a, _benchmark_width * _benchmark_height) - align_buffer_16(src_b, _benchmark_width * _benchmark_height) + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) MaskCpuFlags(kCpuInitialized); double c_time = get_time(); - for (int i = 0; i < _benchmark_iterations; ++i) - CalcFrameSsim(src_a, _benchmark_width, - src_b, _benchmark_width, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFrameSsim(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); - c_time = (get_time() - c_time) / _benchmark_iterations; + c_time = (get_time() - c_time) / benchmark_iterations_; printf("BenchmarkSsim_C - %8.2f us c\n", c_time * 1e6); MaskCpuFlags(-1); @@ -327,18 +327,18 @@ TEST_F(libyuvTest, BenchmarkSsim_C) { } TEST_F(libyuvTest, BenchmarkSsim_OPT) { - align_buffer_16(src_a, _benchmark_width * _benchmark_height) - align_buffer_16(src_b, _benchmark_width * _benchmark_height) + align_buffer_16(src_a, benchmark_width_ * benchmark_height_) + align_buffer_16(src_b, benchmark_width_ * benchmark_height_) MaskCpuFlags(-1); double opt_time = get_time(); - for (int i = 0; i < _benchmark_iterations; ++i) - CalcFrameSsim(src_a, _benchmark_width, - src_b, _benchmark_width, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + CalcFrameSsim(src_a, benchmark_width_, + src_b, benchmark_width_, + benchmark_width_, benchmark_height_); - opt_time = (get_time() - opt_time) / _benchmark_iterations; + opt_time = (get_time() - opt_time) / benchmark_iterations_; printf("BenchmarkPsnr_OPT - %8.2f us opt\n", opt_time * 1e6); EXPECT_EQ(0, 0); diff --git a/unit_test/cpu_test.cc b/unit_test/cpu_test.cc index 5c0f83be1..e350561cd 100644 --- a/unit_test/cpu_test.cc +++ b/unit_test/cpu_test.cc @@ -8,14 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "unit_test.h" - #include #include #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/version.h" +#include "unit_test/unit_test.h" namespace libyuv { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 295413fd0..627acdfb7 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -8,8 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "unit_test.h" - #include #include @@ -17,6 +15,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "unit_test/unit_test.h" #if defined(_MSC_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var @@ -26,20 +25,20 @@ namespace libyuv { -TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { - align_buffer_16(src_y, _benchmark_width * _benchmark_height); - align_buffer_16(src_u, ((_benchmark_width * _benchmark_height) >> 2)); - align_buffer_16(src_v, ((_benchmark_width * _benchmark_height) >> 2)); - align_buffer_16(dst_argb, ((_benchmark_width << 2) * _benchmark_height)); +TEST_F(libyuvTest, BenchmarkI420ToARGB_C) { + align_buffer_16(src_y, benchmark_width_ * benchmark_height_); + align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2); + align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2); + align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_); MaskCpuFlags(kCpuInitialized); - for (int i = 0; i < _benchmark_iterations; ++i) - I420ToARGB(src_y, _benchmark_width, - src_u, _benchmark_width >> 1, - src_v, _benchmark_width >> 1, - dst_argb, _benchmark_width << 2, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + I420ToARGB(src_y, benchmark_width_, + src_u, benchmark_width_ >> 1, + src_v, benchmark_width_ >> 1, + dst_argb, benchmark_width_ << 2, + benchmark_width_, benchmark_height_); MaskCpuFlags(-1); @@ -51,18 +50,18 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_C) { free_aligned_buffer_16(dst_argb) } -TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { - align_buffer_16(src_y, _benchmark_width * _benchmark_height); - align_buffer_16(src_u, (_benchmark_width * _benchmark_height) >> 2); - align_buffer_16(src_v, (_benchmark_width * _benchmark_height) >> 2); - align_buffer_16(dst_argb, (_benchmark_width << 2) * _benchmark_height); +TEST_F(libyuvTest, BenchmarkI420ToARGB_OPT) { + align_buffer_16(src_y, benchmark_width_ * benchmark_height_); + align_buffer_16(src_u, (benchmark_width_ * benchmark_height_) >> 2); + align_buffer_16(src_v, (benchmark_width_ * benchmark_height_) >> 2); + align_buffer_16(dst_argb, (benchmark_width_ << 2) * benchmark_height_); - for (int i = 0; i < _benchmark_iterations; ++i) - I420ToARGB(src_y, _benchmark_width, - src_u, _benchmark_width >> 1, - src_v, _benchmark_width >> 1, - dst_argb, _benchmark_width << 2, - _benchmark_width, _benchmark_height); + for (int i = 0; i < benchmark_iterations_; ++i) + I420ToARGB(src_y, benchmark_width_, + src_u, benchmark_width_ >> 1, + src_v, benchmark_width_ >> 1, + dst_argb, benchmark_width_ << 2, + benchmark_width_, benchmark_height_); free_aligned_buffer_16(src_y) free_aligned_buffer_16(src_u) @@ -71,7 +70,7 @@ TEST_F (libyuvTest, BenchmarkI420ToARGB_OPT) { } #define TESTI420TO(FMT) \ -TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ +TEST_F(libyuvTest, I420To##FMT##_CvsOPT) { \ const int src_width = 1280; \ const int src_height = 720; \ align_buffer_16(src_y, src_width * src_height); \ @@ -103,8 +102,8 @@ TEST_F (libyuvTest, I420To##FMT##_CvsOPT) { \ int err = 0; \ for (int i = 0; i 
< src_height; ++i) { \ for (int j = 0; j < src_width << 2; ++j) { \ - int diff = (int)(dst_rgb_c[i * src_height + j]) - \ - (int)(dst_rgb_opt[i * src_height + j]); \ + int diff = static_cast(dst_rgb_c[i * src_height + j]) - \ + static_cast(dst_rgb_opt[i * src_height + j]); \ if (abs(diff) > 2) \ err++; \ } \ @@ -121,11 +120,48 @@ TESTI420TO(ARGB) TESTI420TO(BGRA) TESTI420TO(ABGR) -TEST_F (libyuvTest, TestAttenuate) { +TEST_F(libyuvTest, TestAttenuate) { SIMD_ALIGNED(uint8 orig_pixels[256][4]); SIMD_ALIGNED(uint8 atten_pixels[256][4]); SIMD_ALIGNED(uint8 unatten_pixels[256][4]); SIMD_ALIGNED(uint8 atten2_pixels[256][4]); + + // Test unattenuation clamps + orig_pixels[0][0] = 200u; + orig_pixels[0][1] = 129u; + orig_pixels[0][2] = 127u; + orig_pixels[0][3] = 128u; + // Test unattenuation transparent and opaque are unaffected + orig_pixels[1][0] = 16u; + orig_pixels[1][1] = 64u; + orig_pixels[1][2] = 192u; + orig_pixels[1][3] = 0u; + orig_pixels[2][0] = 16u; + orig_pixels[2][1] = 64u; + orig_pixels[2][2] = 192u; + orig_pixels[2][3] = 255u; + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 128u; + ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 4, 1); + EXPECT_EQ(255u, unatten_pixels[0][0]); + EXPECT_EQ(255u, unatten_pixels[0][1]); + EXPECT_EQ(254u, unatten_pixels[0][2]); + EXPECT_EQ(128u, unatten_pixels[0][3]); + EXPECT_EQ(16u, unatten_pixels[1][0]); + EXPECT_EQ(64u, unatten_pixels[1][1]); + EXPECT_EQ(192u, unatten_pixels[1][2]); + EXPECT_EQ(0u, unatten_pixels[1][3]); + EXPECT_EQ(16u, unatten_pixels[2][0]); + EXPECT_EQ(64u, unatten_pixels[2][1]); + EXPECT_EQ(192u, unatten_pixels[2][2]); + EXPECT_EQ(255u, unatten_pixels[2][3]); + EXPECT_EQ(32u, unatten_pixels[3][0]); + EXPECT_EQ(128u, unatten_pixels[3][1]); + EXPECT_EQ(255u, unatten_pixels[3][2]); + EXPECT_EQ(128u, unatten_pixels[3][3]); + for (int i = 0; i < 256; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; @@ -156,17 +192,5 @@ TEST_F (libyuvTest, TestAttenuate) { EXPECT_EQ(127, atten_pixels[255][1]); EXPECT_EQ(85, atten_pixels[255][2]); EXPECT_EQ(255, atten_pixels[255][3]); - - // Test unattenuation clamps - orig_pixels[0][0] = 200; - orig_pixels[0][1] = 129; - orig_pixels[0][2] = 127; - orig_pixels[0][3] = 128; - ARGBUnattenuate(&orig_pixels[0][0], 0, &unatten_pixels[0][0], 0, 1, 1); - EXPECT_EQ(255, unatten_pixels[0][0]); - EXPECT_EQ(255, unatten_pixels[0][1]); - EXPECT_EQ(254, unatten_pixels[0][2]); - EXPECT_EQ(128, unatten_pixels[0][3]); } - } diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index 0c134d58b..f29d03de5 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -8,13 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "unit_test.h" - #include #include -#include "libyuv/rotate.h" #include "../source/rotate_priv.h" +#include "libyuv/rotate.h" +#include "unit_test/unit_test.h" namespace libyuv { @@ -33,8 +32,8 @@ TEST_F(libyuvTest, Transpose) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; oh = iw; @@ -77,8 +76,8 @@ TEST_F(libyuvTest, TransposeUV) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; @@ -134,8 +133,8 @@ TEST_F(libyuvTest, RotatePlane90) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; @@ -191,8 +190,8 @@ TEST_F(libyuvTest, RotateUV90) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; @@ -265,8 +264,8 @@ TEST_F(libyuvTest, RotateUV180) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = iw >> 1; @@ -339,8 +338,8 @@ TEST_F(libyuvTest, RotateUV270) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 2) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 2) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; @@ -414,8 +413,8 @@ TEST_F(libyuvTest, RotatePlane180) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = iw; @@ -459,8 +458,8 @@ TEST_F(libyuvTest, RotatePlane270) { int iw, ih, ow, oh; int err = 0; - for (iw = 8; iw < _rotate_max_w && !err; ++iw) - for (ih = 8; ih < _rotate_max_h && !err; ++ih) { + for (iw = 8; iw < rotate_max_w_ && !err; ++iw) + for (ih = 8; ih < rotate_max_h_ && !err; ++ih) { int i; ow = ih; @@ -516,8 +515,8 @@ TEST_F(libyuvTest, RotatePlane90and270) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; ow = ih; @@ -561,8 +560,8 @@ TEST_F(libyuvTest, RotatePlane90Pitch) { int iw, ih; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && !err; iw += 4) + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; int ow = ih; @@ -618,8 +617,8 @@ TEST_F(libyuvTest, RotatePlane270Pitch) { int iw, ih, ow, oh; int err = 0; - for (iw = 16; iw < _rotate_max_w && !err; iw += 4) - for (ih = 16; ih < _rotate_max_h && !err; ih += 4) { + for (iw = 16; iw < rotate_max_w_ && 
!err; iw += 4) + for (ih = 16; ih < rotate_max_h_ && !err; ih += 4) { int i; ow = ih; diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index af715d5b4..c3e0c6dfe 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -8,13 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "unit_test.h" - #include #include #include "libyuv/cpu_id.h" #include "libyuv/scale.h" +#include "unit_test/unit_test.h" namespace libyuv { diff --git a/unit_test/unit_test.cc b/unit_test/unit_test.cc index 35a447351..d6ea03122 100644 --- a/unit_test/unit_test.cc +++ b/unit_test/unit_test.cc @@ -8,15 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include "unit_test.h" +#include "unit_test/unit_test.h" -libyuvTest::libyuvTest() : - _rotate_max_w(128), - _rotate_max_h(128), - _benchmark_iterations(1000), - _benchmark_width(1280), - _benchmark_height(720) { +#include + +libyuvTest::libyuvTest() : rotate_max_w_(128), rotate_max_h_(128), + benchmark_iterations_(1000), benchmark_width_(1280), + benchmark_height_(720) { } int main(int argc, char** argv) { diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index 8b082a1fe..6e128e83a 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -8,17 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef UINIT_TEST_H_ -#define UINIT_TEST_H_ +#ifndef UNIT_TEST_UNIT_TEST_H_ +#define UNIT_TEST_UNIT_TEST_H_ #include -#define align_buffer_16(var, size) \ - uint8 *var; \ - uint8 *var##_mem; \ - var##_mem = reinterpret_cast(calloc((size)+15, sizeof(uint8))); \ - var = reinterpret_cast \ - ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); +#define align_buffer_16(var, size) \ + uint8* var; \ + uint8* var##_mem; \ + var##_mem = reinterpret_cast(calloc((size) + 15, sizeof(uint8))); \ + var = reinterpret_cast \ + ((reinterpret_cast(var##_mem) + 15) & (~0x0f)); \ #define free_aligned_buffer_16(var) \ free(var##_mem); \ @@ -27,12 +27,11 @@ #ifdef WIN32 #include -static double get_time() -{ - LARGE_INTEGER t, f; - QueryPerformanceCounter(&t); - QueryPerformanceFrequency(&f); - return double(t.QuadPart)/double(f.QuadPart); +static double get_time() { + LARGE_INTEGER t, f; + QueryPerformanceCounter(&t); + QueryPerformanceFrequency(&f); + return static_cast(t.QuadPart) / static_cast(f.QuadPart); } #define random rand @@ -46,7 +45,7 @@ static double get_time() { struct timeval t; struct timezone tzp; gettimeofday(&t, &tzp); - return t.tv_sec + t.tv_usec*1e-6; + return t.tv_sec + t.tv_usec * 1e-6; } #endif @@ -55,13 +54,12 @@ class libyuvTest : public ::testing::Test { protected: libyuvTest(); - const int _rotate_max_w; - const int _rotate_max_h; - - const int _benchmark_iterations; - const int _benchmark_width; - const int _benchmark_height; + const int rotate_max_w_; + const int rotate_max_h_; + const int benchmark_iterations_; + const int benchmark_width_; + const int benchmark_height_; }; -#endif // UNIT_TEST_H_ +#endif // UNIT_TEST_UNIT_TEST_H_
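
For reference, the scalar path this patch moves into row_common.cc, ARGBUnattenuateRow_C, approximates the exact per-channel division b = (b * 255 + a / 2) / a with an 8.16 fixed-point reciprocal table, a +0x8000 rounding term, a right shift by 16 and a clamp. The standalone C sketch below (not part of the patch) models exactly that arithmetic and reports how far it strays from the exact rounded division; the function names and the test harness are illustrative only and do not exist in libyuv.

#include <stdint.h>
#include <stdio.h>

// Exact rounded unattenuation for one channel: c_out = (c * 255 + a / 2) / a.
static uint32_t UnattenuateExact(uint32_t c, uint32_t a) {
  return (c * 255u + (a / 2u)) / a;
}

// Reciprocal method mirrored from ARGBUnattenuateRow_C: multiply by an 8.16
// fixed-point inverse (0x1000000 / a), round with +0x8000, shift and clamp.
static uint32_t UnattenuateReciprocal(uint32_t c, uint32_t a) {
  uint32_t ia = 0x1000000u / a;            // 8.16 fixed-point inverse of a.
  uint32_t v = (c * ia + 0x8000u) >> 16;
  return v > 255u ? 255u : v;              // clamp, as the scalar code does.
}

int main(void) {
  // Scan all valid premultiplied inputs (channel value never above alpha) and
  // report the worst deviation of the reciprocal method from exact division.
  uint32_t max_diff = 0;
  for (uint32_t a = 1; a < 256; ++a) {
    for (uint32_t c = 0; c <= a; ++c) {
      uint32_t exact = UnattenuateExact(c, a);
      uint32_t approx = UnattenuateReciprocal(c, a);
      uint32_t diff = exact > approx ? exact - approx : approx - exact;
      if (diff > max_diff) max_diff = diff;
    }
  }
  // The table comment in the patch notes the reciprocal method is off by 1 on
  // some values (e.g. 125); this prints the observed maximum difference.
  printf("max |reciprocal - exact| = %u\n", max_diff);
  return 0;
}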
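
The new SSE2 rows in row_posix.cc and row_win.cc do the same job in 16-bit lanes: punpcklbw/punpckhbw duplicate each colour byte into a word (c becomes c * 0x101), the per-pixel entry fixed_invtbl8[a] = 0x10000 / a is fetched through a scalar register and broadcast to the B, G and R words with pshuflw 0xC0 (the alpha word gets zero), pmulhuw keeps the high 16 bits of each product, and packuswb saturates back to bytes before the original alphas are OR'd back in. The sketch below is a hedged scalar model of that word-level arithmetic, including the table's special-cased 0x100 (alpha 0 passes through) and 0xffff (alpha 1) entries; the names are illustrative, and the real row functions of course process four pixels per iteration.

#include <stdint.h>
#include <stdio.h>

// Scalar stand-in for fixed_invtbl8[a] = 0x10000 / a with the two special
// entries used by the patch: a == 0 -> 0x100 (pass-through), a == 1 -> 0xffff.
static uint16_t InvTbl8(uint32_t a) {
  if (a == 0) return 0x100;
  if (a == 1) return 0xffff;
  return (uint16_t)(0x10000u / a);
}

// One-channel model of the SSE2 path: byte duplication, pmulhuw, saturation.
static uint8_t UnattenuateSse2Model(uint8_t c, uint8_t a) {
  uint32_t word = (uint32_t)c * 0x101u;       // punpcklbw xmm0,xmm0: c -> c*257
  uint32_t prod = (word * InvTbl8(a)) >> 16;  // pmulhuw: high 16 bits of product
  return prod > 255u ? 255u : (uint8_t)prod;  // packuswb: unsigned saturation
}

int main(void) {
  // Spot-check a few of the values exercised by the new TestAttenuate cases:
  // (16, 64, 192) at alpha 128 should come out as (32, 128, 255), and alpha 0
  // should leave the colour byte unchanged.
  printf("%u %u %u\n",
         UnattenuateSse2Model(16, 128),
         UnattenuateSse2Model(64, 128),
         UnattenuateSse2Model(192, 128));
  printf("%u\n", UnattenuateSse2Model(16, 0));
  return 0;
}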