From 8ed54222e723037322579f15c36d4faddb924e91 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 18 Apr 2012 17:07:07 +0000
Subject: [PATCH] Attenuation ported to SSE2

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/485009

git-svn-id: http://libyuv.googlecode.com/svn/trunk@242 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |  2 +-
 include/libyuv/version.h   |  2 +-
 source/planar_functions.cc | 79 ++++++++++++++++++++++++++++----------
 source/row.h               |  4 ++
 source/row_common.cc       | 36 +++++++++++++++++
 source/row_posix.cc        | 47 ++++++++++++++++++++++-
 source/row_win.cc          | 44 +++++++++++++++++++++
 unit_test/planar_test.cc   | 27 ++++++++-----
 8 files changed, 207 insertions(+), 34 deletions(-)

diff --git a/README.chromium b/README.chromium
index 0c0fc1381..401ee75d9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 241
+Version: 242
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 1bf959c68..70d29b9b4 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 241
+#define LIBYUV_VERSION 242
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
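Background for the planar_functions.cc changes that follow: attenuation premultiplies each color channel by its alpha, so that later alpha blends need one fewer multiply per channel. An unattenuated blend computes p = a * f + (1 - a) * b, while a preattenuated source only needs p = f + (1 - a) * b. A minimal scalar sketch of the two forms (illustrative only, not code from this patch; the helper names are made up):

    #include <stdint.h>

    // Straight (unattenuated) blend: p = a * f + (1 - a) * b per channel.
    static uint8_t BlendStraight(uint8_t f, uint8_t b, uint8_t a) {
      return (uint8_t)((f * a + b * (255 - a) + 127) / 255);
    }

    // Preattenuated blend: p = f' + (1 - a) * b, where f' = f * a / 255 was
    // computed once up front by ARGBAttenuate. One multiply is saved per
    // channel, and the sum cannot exceed 255, so no clamp is needed.
    static uint8_t BlendPreattenuated(uint8_t fa, uint8_t b, uint8_t a) {
      return (uint8_t)(fa + (b * (255 - a) + 127) / 255);
    }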
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 0145fcf90..eb946745a 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -863,24 +863,6 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
   return 0;
 }
 
-// Multiply source RGB by alpha and store to destination.
-// b = (b * a + 127) / 255;
-static void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb,
-                               int width) {
-  for (int i = 0; i < width; ++i) {
-    const uint32 b = src_argb[0];
-    const uint32 g = src_argb[1];
-    const uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    dst_argb[0] = (b * a + 127) / 255;
-    dst_argb[1] = (g * a + 127) / 255;
-    dst_argb[2] = (r * a + 127) / 255;
-    dst_argb[3] = a;
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-
 // Convert unattenuated ARGB values to preattenuated ARGB.
 // An unattenuated ARGB alpha blend uses the formula
 // p = a * f + (1 - a) * b
@@ -902,9 +884,18 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+#if defined(HAS_ARGBATTENUATE_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
-    ARGBAttenuateRow_C(src_argb, dst_argb, width);
+    ARGBAttenuateRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
@@ -916,6 +907,43 @@
 // g = (g * 255 + (a / 2)) / a;
 // r = (r * 255 + (a / 2)) / a;
 // Reciprocal method is off by 1 on some values, e.g. 125.
+// 8.16 fixed point inverse table
+#define T(a) 0x1000000 / a
+static uint32 fixed_invtbl[256] = {
+  0, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), T(0xff) };
+#undef T
+
 static void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb,
                                  int width) {
@@ -924,7 +952,7 @@ static void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb,
     uint32 r = src_argb[2];
     const uint32 a = src_argb[3];
     if (a) {
-      const uint32 ia = (0x1000000 + (a >> 1)) / a;  // 8.16 fixed point
+      const uint32 ia = fixed_invtbl[a];  // 8.16 fixed point
       b = (b * ia + 0x8000) >> 16;
       g = (g * ia + 0x8000) >> 16;
       r = (r * ia + 0x8000) >> 16;
@@ -957,9 +985,18 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
     src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
+  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
+#if defined(HAS_ARGBUNATTENUATE_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
-    ARGBUnattenuateRow_C(src_argb, dst_argb, width);
+    ARGBUnattenuateRow(src_argb, dst_argb, width);
     src_argb += src_stride_argb;
     dst_argb += dst_stride_argb;
   }
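A note on the fixed_invtbl added above: entry a holds 0x1000000 / a, i.e. 256 / a in 8.16 fixed point, so (x * ia + 0x8000) >> 16 recovers roughly x * 256 / a without a per-pixel divide. Unlike the replaced expression (0x1000000 + (a >> 1)) / a, the table entries are truncated rather than rounded, one more source of off-by-one results. The same values could be built at startup; a sketch (InitInvTable is a hypothetical helper, not part of libyuv):

    #include <stdint.h>

    static uint32_t inv_tbl[256];

    // Build the same values as fixed_invtbl at runtime.
    static void InitInvTable(void) {
      inv_tbl[0] = 0;  // alpha 0 is special-cased by the if (a) guard
      for (uint32_t a = 1; a < 256; ++a) {
        inv_tbl[a] = 0x1000000u / a;  // 8.16 fixed point: (256 << 16) / a
      }
    }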
diff --git a/source/row.h b/source/row.h
index a541c7621..895517a6e 100644
--- a/source/row.h
+++ b/source/row.h
@@ -66,6 +66,7 @@ extern "C" {
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_ARGBBLENDROW_SSE2
 #define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBATTENUATE_SSE2
 #endif
 
 // The following are available on Neon platforms
@@ -360,6 +361,9 @@ void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int src_stride_uyvy,
                           uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

diff --git a/source/row_common.cc b/source/row_common.cc
index e61fb0dc8..3df51bfd1 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -664,6 +664,42 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
 #undef UVANY
 #endif
 
+// Multiply source RGB by alpha and store to destination.
+// b = (b * a + 255) >> 8, an approximation of b = (b * a + 127) / 255.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  for (int i = 0; i < width - 1; i += 2) {
+    uint32 b = src_argb[0];
+    uint32 g = src_argb[1];
+    uint32 r = src_argb[2];
+    uint32 a = src_argb[3];
+    dst_argb[0] = (b * a + 255) >> 8;
+    dst_argb[1] = (g * a + 255) >> 8;
+    dst_argb[2] = (r * a + 255) >> 8;
+    dst_argb[3] = a;
+    b = src_argb[4];
+    g = src_argb[5];
+    r = src_argb[6];
+    a = src_argb[7];
+    dst_argb[4] = (b * a + 255) >> 8;
+    dst_argb[5] = (g * a + 255) >> 8;
+    dst_argb[6] = (r * a + 255) >> 8;
+    dst_argb[7] = a;
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    const uint32 b = src_argb[0];
+    const uint32 g = src_argb[1];
+    const uint32 r = src_argb[2];
+    const uint32 a = src_argb[3];
+    dst_argb[0] = (b * a + 255) >> 8;
+    dst_argb[1] = (g * a + 255) >> 8;
+    dst_argb[2] = (r * a + 255) >> 8;
+    dst_argb[3] = a;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
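The rewritten ARGBAttenuateRow_C above trades the exact (x * a + 127) / 255 for (x * a + 255) >> 8 so that the C and SSE2 paths agree closely. The two formulas differ by at most 1 over all byte inputs, which a quick exhaustive check confirms (standalone snippet, not part of the patch):

    #include <stdio.h>

    // Exhaustively compare exact attenuation with the shift approximation.
    int main(void) {
      int worst = 0;
      for (int x = 0; x < 256; ++x) {
        for (int a = 0; a < 256; ++a) {
          int exact = (x * a + 127) / 255;  // removed C path
          int approx = (x * a + 255) >> 8;  // new C path
          int d = exact > approx ? exact - approx : approx - exact;
          if (d > worst) worst = d;
        }
      }
      printf("max difference = %d\n", worst);  // prints 1
      return 0;
    }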
diff --git a/source/row_posix.cc b/source/row_posix.cc
index d7d174bc2..33828b65e 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2164,7 +2164,7 @@ CONST uvec8 kShuffleAlpha = {
   11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
 };
 void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                               uint8* dst_argb, int width) {
+                                uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     "psrlw     $0xf,%%xmm7                     \n"
@@ -2232,6 +2232,51 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// Requires src_argb and dst_argb to be aligned to 16 bytes.
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x8,%%xmm5                     \n"
+    // 4 pixel loop
+  "1:                                          \n"
+    "movdqa    (%0),%%xmm0                     \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqa    (%0),%%xmm1                     \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqa    (%0),%%xmm3                     \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "pand      %%xmm4,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%0,%1,1)                \n"
+    "lea       0x10(%0),%0                     \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBATTENUATE_SSE2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus

diff --git a/source/row_win.cc b/source/row_win.cc
index 09c2eb162..e18141e3a 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2292,6 +2292,50 @@ void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
 
+#ifdef HAS_ARGBATTENUATE_SSE2
+// Attenuate 4 pixels at a time.
+// Requires src_argb and dst_argb to be aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
+    psrld      xmm5, 8
+
+    align      16
+ convertloop:
+    movdqa     xmm0, [eax]      // read 4 pixels
+    punpcklbw  xmm0, xmm0       // first 2
+    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm0, xmm2       // rgb * a
+    movdqa     xmm1, [eax]      // read 4 pixels
+    punpckhbw  xmm1, xmm1       // next 2 pixels
+    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqa     xmm3, [eax]      // alphas
+    psrlw      xmm0, 8
+    pand       xmm3, xmm4
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    pand       xmm0, xmm5       // keep original alphas
+    por        xmm0, xmm3
+    sub        ecx, 4
+    movdqa     [eax + edx], xmm0
+    lea        eax, [eax + 16]
+    jg         convertloop
+
+    ret
+  }
+}
+
+#endif  // HAS_ARGBATTENUATE_SSE2
 #endif  // _M_IX86
 
 #ifdef __cplusplus
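For readers not fluent in the assembly above, here is the same 4-pixels-per-iteration algorithm expressed with SSE2 intrinsics (an illustrative sketch, not the committed code, which is hand-written asm). punpcklbw of a register with itself turns each byte x into the word x * 257; multiplying two such words with pmulhuw and shifting right by 8 yields approximately x * a / 255. Like the asm, this assumes 16-byte aligned pointers and a width that is a multiple of 4:

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    void ARGBAttenuateRow_SSE2_Sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb, int width) {
      const __m128i kAlphaMask = _mm_set1_epi32((int)0xff000000);  // keep a
      const __m128i kColorMask = _mm_set1_epi32(0x00ffffff);       // keep bgr
      for (int i = 0; i < width; i += 4) {
        __m128i p = _mm_load_si128((const __m128i*)(src_argb + i * 4));
        // Low 2 pixels: expand bytes x to words x * 257, broadcast alphas.
        __m128i lo = _mm_unpacklo_epi8(p, p);
        __m128i alo = _mm_shufflelo_epi16(_mm_shufflehi_epi16(lo, 0xff), 0xff);
        lo = _mm_mulhi_epu16(lo, alo);             // (x*257 * a*257) >> 16
        // High 2 pixels, same treatment.
        __m128i hi = _mm_unpackhi_epi8(p, p);
        __m128i ahi = _mm_shufflelo_epi16(_mm_shufflehi_epi16(hi, 0xff), 0xff);
        hi = _mm_mulhi_epu16(hi, ahi);
        // >> 8 and repack: net result is roughly x * a / 255 per channel.
        lo = _mm_srli_epi16(lo, 8);
        hi = _mm_srli_epi16(hi, 8);
        __m128i bgr = _mm_and_si128(_mm_packus_epi16(lo, hi), kColorMask);
        __m128i a = _mm_and_si128(p, kAlphaMask);  // original alphas
        _mm_store_si128((__m128i*)(dst_argb + i * 4), _mm_or_si128(bgr, a));
      }
    }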
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 78a15abfe..295413fd0 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -18,6 +18,12 @@
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 
+#if defined(_MSC_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+#else  // __GNUC__
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+#endif
+
 namespace libyuv {
 
 TEST_F (libyuvTest, BenchmarkI420ToARGB_C) {
@@ -116,10 +122,10 @@ TESTI420TO(BGRA)
 TESTI420TO(ABGR)
 
 TEST_F (libyuvTest, TestAttenuate) {
-  uint8 orig_pixels[256][4];
-  uint8 atten_pixels[256][4];
-  uint8 unatten_pixels[256][4];
-  uint8 atten2_pixels[256][4];
+  SIMD_ALIGNED(uint8 orig_pixels[256][4]);
+  SIMD_ALIGNED(uint8 atten_pixels[256][4]);
+  SIMD_ALIGNED(uint8 unatten_pixels[256][4]);
+  SIMD_ALIGNED(uint8 atten2_pixels[256][4]);
   for (int i = 0; i < 256; ++i) {
     orig_pixels[i][0] = i;
     orig_pixels[i][1] = i / 2;
@@ -128,13 +134,14 @@ TEST_F (libyuvTest, TestAttenuate) {
   }
   ARGBAttenuate(&orig_pixels[0][0], 0, &atten_pixels[0][0], 0, 256, 1);
   ARGBUnattenuate(&atten_pixels[0][0], 0, &unatten_pixels[0][0], 0, 256, 1);
-  ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
-
+  for (int i = 0; i < 1000 * 1280 * 720 / 256; ++i) {
+    ARGBAttenuate(&unatten_pixels[0][0], 0, &atten2_pixels[0][0], 0, 256, 1);
+  }
   for (int i = 0; i < 256; ++i) {
-    EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 1);
-    EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 1);
-    EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 1);
-    EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 1);
+    EXPECT_NEAR(atten_pixels[i][0], atten2_pixels[i][0], 2);
+    EXPECT_NEAR(atten_pixels[i][1], atten2_pixels[i][1], 2);
+    EXPECT_NEAR(atten_pixels[i][2], atten2_pixels[i][2], 2);
+    EXPECT_NEAR(atten_pixels[i][3], atten2_pixels[i][3], 2);
   }
   // Make sure transparent, 50% and opaque are fully accurate.
   EXPECT_EQ(0, atten_pixels[0][0]);
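The loosened test tolerance (1 to 2) reflects the stacked approximations: attenuation now uses a >> 8 shift and unattenuation a truncated reciprocal. Note the test compares the two attenuated results rather than the original pixels, since unattenuation at alpha a can only recover a channel to a granularity of roughly 256 / a. A scalar sketch of the round trip being verified (the clamp is this sketch's own safety net, not a claim about the row function's internals):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      for (int i = 1; i < 256; ++i) {
        int b = i, a = i;                        // mirrors orig_pixels[i]
        int atten = (b * a + 255) >> 8;          // ARGBAttenuate step
        uint32_t ia = 0x1000000u / (uint32_t)a;  // fixed_invtbl[a]
        uint32_t un = ((uint32_t)atten * ia + 0x8000) >> 16;
        if (un > 255) un = 255;                  // clamp (sketch only)
        int atten2 = ((int)un * a + 255) >> 8;   // re-attenuate
        if (atten2 - atten > 2 || atten - atten2 > 2) {
          printf("off by more than 2 at %d: %d vs %d\n", i, atten, atten2);
        }
      }
      return 0;
    }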