From 221e602f8a726f7457a0d521b5bcca05d89215bb Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 21 May 2012 22:24:41 +0000 Subject: [PATCH] ARGBSepia planar function for converting a region of ARGB image to Sepia tone. ARGBGray optimized weaving of alpha value. 551 ms from 568 ms. BUG=none TEST=libyuv_unittest --gtest_filter=*ARGBSepia* Review URL: https://webrtc-codereview.appspot.com/573008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@270 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 5 ++ include/libyuv/version.h | 2 +- source/planar_functions.cc | 22 ++++++ source/row.h | 4 + source/row_common.cc | 23 ++++++ source/row_posix.cc | 118 ++++++++++++++++++++++++------ source/row_win.cc | 113 ++++++++++++++++++++++------ unit_test/planar_test.cc | 57 ++++++++++++++- 9 files changed, 297 insertions(+), 49 deletions(-) diff --git a/README.chromium b/README.chromium index 27dbfd4db..9499fb09b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 269 +Version: 270 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index b30ca3e2b..1ffa5bd02 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -138,6 +138,11 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height); +// Make a rectangle of ARGB Sepia tone in place. Returns 0, or -1 on bad argument. +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int x, int y, + int width, int height); + // Copy ARGB to ARGB. 
int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8a5df622f..efe352d24 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 269 +#define LIBYUV_VERSION 270 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index d2791ca94..9af76b690 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1059,6 +1059,28 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, return 0; } +// Make a rectangle of ARGB Sepia tone in place. Returns 0, or -1 on bad argument. +int ARGBSepia(uint8* dst_argb, int dst_stride_argb, + int dst_x, int dst_y, + int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; // First pixel of the rectangle. + void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; +#if defined(HAS_ARGBSEPIAROW_SSSE3) // Row code uses aligned loads on dst, so test dst, not dst_argb. + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBSepiaRow = ARGBSepiaRow_SSSE3; + } +#endif + for (int y = 0; y < height; ++y) { + ARGBSepiaRow(dst, width); + dst += dst_stride_argb; + } + return 0; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row.h b/source/row.h index d8604f745..3c44d416d 100644 --- a/source/row.h +++ b/source/row.h @@ -72,6 +72,7 @@ extern "C" { #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 #define HAS_ARGBGRAYROW_SSSE3 +#define HAS_ARGBSEPIAROW_SSSE3 #endif // The following are available only useful when SSSE3 is unavailable. 
@@ -403,6 +404,9 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_C(uint8* dst_argb, int width); void ARGBGrayRow_SSSE3(uint8* dst_argb, int width); +void ARGBSepiaRow_C(uint8* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index 45fe9f8af..cb1a6e2c7 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -249,6 +249,29 @@ void ARGBGrayRow_C(uint8* dst_argb, int width) { } } +// Convert a row of width ARGB pixels (4 bytes each) to Sepia tone in place. +void ARGBSepiaRow_C(uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int sb = (r * 35 + g * 68 + b * 17) >> 7; + int sg = (r * 45 + g * 88 + b * 22) >> 7; + int sr = (r * 50 + g * 98 + b * 24) >> 7; + if (sr > 255) { + sr = 255; + } + if (sg > 255) { + sg = 255; + } + // sb does not overflow (weights sum to 120 < 128). a is preserved from original. + dst_argb[0] = sb; + dst_argb[1] = sg; + dst_argb[2] = sr; + dst_argb += 4; + } +} + void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // Copy a Y to RGB. for (int x = 0; x < width; ++x) { diff --git a/source/row_posix.cc b/source/row_posix.cc index 340dd651f..1e61e1fea 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -108,11 +108,6 @@ CONST uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; -// Constant for ARGB color to gray scale. 
0.11 * B + 0.59 * G + 0.30 * R -CONST vec8 kARGBToGray = { - 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 -}; - void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -2533,15 +2528,15 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #endif // HAS_ARGBUNATTENUATE_SSE2 #ifdef HAS_ARGBGRAYROW_SSSE3 +// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R +CONST vec8 kARGBToGray = { + 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 +}; + // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) { asm volatile ( "movdqa %2,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x8,%%xmm3 \n" - // 8 pixel loop \n" ".p2align 4 \n" "1: \n" @@ -2549,36 +2544,113 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) { "movdqa 0x10(%0),%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" - "movdqa (%0),%%xmm6 \n" - "movdqa 0x10(%0),%%xmm7 \n" - "pand %%xmm5,%%xmm6 \n" - "pand %%xmm5,%%xmm7 \n" "phaddw %%xmm1,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" + "movdqa (%0),%%xmm2 \n" + "movdqa 0x10(%0),%%xmm3 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "pand %%xmm3,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "por %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" "sub $0x8,%1 \n" "movdqa %%xmm0,(%0) \n" "movdqa %%xmm1,0x10(%0) \n" "lea 0x20(%0),%0 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 + : "+r"(dst_argb), // %0 + "+r"(width) // %1 : "m"(kARGBToGray) // %2 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" #endif ); } #endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +CONST vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +CONST vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 +}; + +CONST vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + asm volatile ( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + // 8 pixel loop \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0,(%0) \n" + "movdqa %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + 
"m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBSEPIAROW_SSSE3 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index eb49ac46c..89eacf9d0 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -98,11 +98,6 @@ static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; -// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R -static const vec8 kARGBToGray = { - 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 -}; - __declspec(naked) __declspec(align(16)) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { @@ -2558,6 +2553,11 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #endif // HAS_ARGBUNATTENUATE_SSE2 #ifdef HAS_ARGBGRAYROW_SSSE3 +// Constant for ARGB color to gray scale. 
0.11 * B + 0.59 * G + 0.30 * R +static const vec8 kARGBToGray = { + 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 +}; + // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels __declspec(naked) __declspec(align(16)) void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) { @@ -2565,33 +2565,28 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) { mov eax, [esp + 4] /* dst_argb */ mov ecx, [esp + 8] /* width */ movdqa xmm4, kARGBToGray - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 - pslld xmm5, 24 - pcmpeqb xmm3, xmm3 // generate mask 0x00ffffff - psrld xmm3, 8 align 16 convertloop: - movdqa xmm0, [eax] + movdqa xmm0, [eax] // G movdqa xmm1, [eax + 16] pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 - movdqa xmm6, [eax] // preserve alpha - movdqa xmm7, [eax + 16] - pand xmm6, xmm5 - pand xmm7, xmm5 phaddw xmm0, xmm1 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 Y values - - punpcklbw xmm0, xmm0 + packuswb xmm0, xmm0 // 8 G bytes + movdqa xmm2, [eax] // A + movdqa xmm3, [eax + 16] + psrld xmm2, 24 + psrld xmm3, 24 + packuswb xmm2, xmm3 + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 - punpckhwd xmm1, xmm1 - pand xmm0, xmm3 // mask in alpha - pand xmm1, xmm3 - por xmm0, xmm6 - por xmm1, xmm7 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 sub ecx, 8 movdqa [eax], xmm0 movdqa [eax + 16], xmm1 @@ -2601,8 +2596,80 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) { } } #endif // HAS_ARGBGRAYROW_SSSE3 + +#ifdef HAS_ARGBSEPIAROW_SSSE3 +// b = (r * 35 + g * 68 + b * 17) >> 7 +// g = (r * 45 + g * 88 + b * 22) >> 7 +// r = (r * 50 + g * 98 + b * 24) >> 7 +// Constant for ARGB color to sepia tone +static const vec8 kARGBToSepiaB = { + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 +}; + +static const vec8 kARGBToSepiaG = { + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 
45, 0 +}; + +static const vec8 kARGBToSepiaR = { + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 +}; + +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels +__declspec(naked) __declspec(align(16)) +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ + movdqa xmm2, kARGBToSepiaB + movdqa xmm3, kARGBToSepiaG + movdqa xmm4, kARGBToSepiaR + + align 16 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddw xmm0, xmm6 + psrlw xmm0, 7 + packuswb xmm0, xmm0 // 8 B values + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddw xmm5, xmm1 + psrlw xmm5, 7 + packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBSEPIAROW_SSSE3 #endif // _M_IX86 + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 86ada07f8..26084770f 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -244,7 +244,8 @@ TEST_F(libyuvTest, TestARGBGray) { orig_pixels[3][1] = 64u; orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; - ARGBGray(&orig_pixels[0][0], 0, 0, 0, 4, 1); + // Do 16 to test asm version. 
+ ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1); EXPECT_EQ(27u, orig_pixels[0][0]); EXPECT_EQ(27u, orig_pixels[0][1]); EXPECT_EQ(27u, orig_pixels[0][2]); @@ -273,4 +274,58 @@ TEST_F(libyuvTest, TestARGBGray) { ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1); } } + +TEST_F(libyuvTest, TestARGBSepia) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. NOTE(review): pixels 4..15 look uninitialized here and are not checked. + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1); + EXPECT_EQ(33u, orig_pixels[0][0]); + EXPECT_EQ(43u, orig_pixels[0][1]); + EXPECT_EQ(47u, orig_pixels[0][2]); + EXPECT_EQ(128u, orig_pixels[0][3]); + EXPECT_EQ(135u, orig_pixels[1][0]); + EXPECT_EQ(175u, orig_pixels[1][1]); + EXPECT_EQ(195u, orig_pixels[1][2]); + EXPECT_EQ(0u, orig_pixels[1][3]); + EXPECT_EQ(69u, orig_pixels[2][0]); + EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(99u, orig_pixels[2][2]); + EXPECT_EQ(255u, orig_pixels[2][3]); + EXPECT_EQ(88u, orig_pixels[3][0]); + EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(127u, orig_pixels[3][2]); + EXPECT_EQ(224u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < 1000 * 1280 * 720 / 256; ++i) { + ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1); + } +} } // namespace libyuv