From e442dc4c2a896e85419628e3b7d97c4dfbe71c9d Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 18 Jun 2012 17:37:09 +0000 Subject: [PATCH] ARGBcolorMatrix for applying transforms such as grey and sepia in a more general form. Unittest does sepia for comparison. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/656004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@288 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 12 ++++ include/libyuv/version.h | 2 +- source/planar_functions.cc | 44 ++++++++++++ source/row.h | 15 ++++ source/row_common.cc | 67 ++++++++++++++++-- source/row_posix.cc | 65 ++++++++++++++++- source/row_win.cc | 113 +++++++++++++++++++++++++++++- unit_test/planar_test.cc | 62 ++++++++++++++++ 9 files changed, 371 insertions(+), 11 deletions(-) diff --git a/README.chromium b/README.chromium index 95f46b384..4dea75e1a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 287 +Version: 288 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 49a02cc6d..0076594df 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -216,6 +216,18 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, int ARGBSepia(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height); +// Apply a 4x3 matrix rotation to each ARGB pixel. +// matrix_argb is 3 rows of 4 signed values; -128 to 127 represents -1 to 1. +int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int x, int y, int width, int height); + +// Apply a color table to each ARGB pixel. +// Table contains 256 ARGB values. +int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int x, int y, int width, int height); + // Copy ARGB to ARGB. 
int ARGBCopy(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 528de2bf8..6cf7639c0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 287 +#define LIBYUV_VERSION 288 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 0ec658ca7..49a3a16eb 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1468,7 +1468,51 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, return 0; } +// Apply a 4x3 color matrix to ARGB pixels. SIMD path needs 16 byte aligned dst. +int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb, + int width) = ARGBColorMatrixRow_C; +#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; + } +#endif + for (int y = 0; y < height; ++y) { + ARGBColorMatrixRow(dst, matrix_argb, width); + dst += dst_stride_argb; + } + return 0; +} +// Apply a color table to each ARGB pixel. +// Table contains 256 ARGB values. 
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, + const uint8* table_argb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { + return -1; + } + void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + int width) = ARGBColorTableRow_C; +#if defined(HAS_ARGBCOLORTABLEROW_X86) + if (TestCpuFlag(kCpuHasX86)) { + ARGBColorTableRow = ARGBColorTableRow_X86; + } +#endif + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + for (int y = 0; y < height; ++y) { + ARGBColorTableRow(dst, table_argb, width); + dst += dst_stride_argb; + } + return 0; +} #ifdef HAVE_JPEG struct ARGBBuffers { uint8* argb; diff --git a/source/row.h b/source/row.h index c9e0c12f6..1648a047b 100644 --- a/source/row.h +++ b/source/row.h @@ -75,10 +75,17 @@ extern "C" { #define HAS_YUY2TOYROW_SSE2 #define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBSEPIAROW_SSSE3 +#define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2 #endif +// The following are Windows only (32 bit x86, see _M_IX86): +#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) +#define HAS_ARGBCOLORTABLEROW_X86 +#endif + + // The following are disabled when SSSE3 is available: #if !defined(YUV_DISABLE_ASM) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ @@ -482,6 +489,14 @@ void ARGBGrayRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); +void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width); + +void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width); + // Used for blur. 
void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, int width, int area, uint8* dst, int count); diff --git a/source/row_common.cc b/source/row_common.cc index 85cb70d3b..741c8f578 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -293,16 +293,54 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; - int sb = (r * 35 + g * 68 + b * 17) >> 7; - int sg = (r * 45 + g * 88 + b * 22) >> 7; - int sr = (r * 50 + g * 98 + b * 24) >> 7; + int sb = (b * 17 + g * 68 + r * 35) >> 7; + int sg = (b * 22 + g * 88 + r * 45) >> 7; + int sr = (b * 24 + g * 98 + r * 50) >> 7; + // b does not overflow. a is preserved from original. + if (sg > 255) { + sg = 255; + } if (sr > 255) { sr = 255; } + dst_argb[0] = sb; + dst_argb[1] = sg; + dst_argb[2] = sr; + dst_argb += 4; + } +} + +// Apply color matrix to a row of image. Matrix is signed, scaled by 128. +void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + + r * matrix_argb[2] + a * matrix_argb[3]) >> 7; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + + r * matrix_argb[6] + a * matrix_argb[7]) >> 7; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + + r * matrix_argb[10] + a * matrix_argb[11]) >> 7; + if (sb < 0) { + sb = 0; + } + if (sb > 255) { + sb = 255; + } + if (sg < 0) { + sg = 0; + } if (sg > 255) { sg = 255; } - // b does not over flow. a is preserved from original. + if (sr < 0) { + sr = 0; + } + if (sr > 255) { + sr = 255; + } dst_argb[0] = sb; dst_argb[1] = sg; dst_argb[2] = sr; @@ -310,6 +348,21 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { } } +// Apply color table to a row of image. 
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { + for (int x = 0; x < width; ++x) { + int b = dst_argb[0]; + int g = dst_argb[1]; + int r = dst_argb[2]; + int a = dst_argb[3]; + dst_argb[0] = table_argb[b * 4 + 0]; + dst_argb[1] = table_argb[g * 4 + 1]; + dst_argb[2] = table_argb[r * 4 + 2]; + dst_argb[3] = table_argb[a * 4 + 3]; + dst_argb += 4; + } +} + void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // Copy a Y to RGB. for (int x = 0; x < width; ++x) { @@ -790,9 +843,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) #endif #if defined(HAS_I422TOARGBROW_NEON) -YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C) -YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C) -YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C) +YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) +YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) +YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) #endif #undef YANY diff --git a/source/row_posix.cc b/source/row_posix.cc index e69779d46..90adcb888 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2800,7 +2800,7 @@ CONST vec8 kARGBToSepiaR = { 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 }; -// Convert 8 ARGB pixels (64 bytes) to 8 Sepia ARGB pixels +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { asm volatile ( "movdqa %2,%%xmm2 \n" @@ -2859,6 +2859,69 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { } #endif // HAS_ARGBSEPIAROW_SSSE3 +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Transform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided. 
+void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width) { + asm volatile ( + "movd (%2),%%xmm2 \n" + "movd 0x4(%2),%%xmm3 \n" + "movd 0x8(%2),%%xmm4 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + // 8 pixel loop \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddsw %%xmm6,%%xmm0 \n" // saturating add + "psraw $0x7,%%xmm0 \n" // signed shift + "packuswb %%xmm0,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm1,%%xmm5 \n" + "psraw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa (%0),%%xmm5 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddsw %%xmm1,%%xmm5 \n" + "psraw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqa (%0),%%xmm6 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "sub $0x8,%1 \n" + "movdqa %%xmm0,(%0) \n" + "movdqa %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(matrix_argb) // %2 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. 
diff --git a/source/row_win.cc b/source/row_win.cc index 6a710672a..85ba1a707 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2877,7 +2877,7 @@ static const vec8 kARGBToSepiaR = { 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 }; -// Convert 8 ARGB pixels (64 bytes) to 8 Sepia ARGB pixels +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. __declspec(naked) __declspec(align(16)) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { __asm { @@ -2930,6 +2930,117 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { } } #endif // HAS_ARGBSEPIAROW_SSSE3 +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 +// Transform 8 ARGB pixels (32 bytes) with color matrix. +// Same as Sepia except matrix is provided and may be negative. +// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R +// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. +// TODO(fbarchard): phaddsw not paired. +// TODO(fbarchard): Test data copying from mem instead of from reg. +// TODO(fbarchard): packing and then unpacking the A - is simple pand/por faster +__declspec(naked) __declspec(align(16)) +void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + mov edx, [esp + 8] /* matrix_argb */ + mov ecx, [esp + 12] /* width */ + movd xmm2, [edx] + movd xmm3, [edx + 4] + movd xmm4, [edx + 8] + pshufd xmm2, xmm2, 0 + pshufd xmm3, xmm3, 0 + pshufd xmm4, xmm4, 0 + + align 16 + convertloop: + movdqa xmm0, [eax] // B + movdqa xmm6, [eax + 16] + pmaddubsw xmm0, xmm2 + pmaddubsw xmm6, xmm2 + phaddsw xmm0, xmm6 // saturating add + psraw xmm0, 7 // signed shift + packuswb xmm0, xmm0 // 8 B values + movdqa xmm5, [eax] // G + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm3 + pmaddubsw xmm1, xmm3 + phaddsw xmm5, xmm1 + psraw xmm5, 7 + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values + movdqa xmm5, [eax] // R + movdqa xmm1, [eax + 16] + pmaddubsw xmm5, xmm4 + pmaddubsw xmm1, xmm4 + phaddsw xmm5, xmm1 + psraw xmm5, 7 + 
packuswb xmm5, xmm5 // 8 R values + movdqa xmm6, [eax] // A + movdqa xmm1, [eax + 16] + psrld xmm6, 24 + psrld xmm1, 24 + packuswb xmm6, xmm1 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 + sub ecx, 8 + movdqa [eax], xmm0 + movdqa [eax + 16], xmm1 + lea eax, [eax + 32] + jg convertloop + ret + } +} +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 + +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Transform ARGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { + __asm { + push ebx + push edi + push ebp + push esi // callee-saved; used as table index below + mov eax, [esp + 16 + 4] /* dst_argb */ + mov edi, [esp + 16 + 8] /* table_argb */ + mov ecx, [esp + 16 + 12] /* width */ + + align 16 + convertloop: + mov ebp, dword ptr [eax] // BGRA + mov esi, ebp + and ebp, 255 + shr esi, 8 + and esi, 255 + mov bl, [edi + ebp * 4 + 0] // B + mov dl, [edi + esi * 4 + 1] // G + mov ebp, dword ptr [eax] // BGRA + mov esi, ebp + shr ebp, 16 + shr esi, 24 + and ebp, 255 + mov [eax], bl + mov [eax + 1], dl + mov bl, [edi + ebp * 4 + 2] // R + mov dl, [edi + esi * 4 + 3] // A + mov [eax + 2], bl + mov [eax + 3], dl + lea eax, [eax + 4] + sub ecx, 1 + jg convertloop + pop esi + pop ebp + pop edi + pop ebx + ret + } +} +#endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 // Consider float CumulativeSum. diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index cb863cca0..d50e7cfd7 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -487,4 +487,66 @@ TEST_F(libyuvTest, TestARGBSepia) { ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 256, 1); } } + +TEST_F(libyuvTest, TestARGBColorMatrix) { + SIMD_ALIGNED(uint8 orig_pixels[256][4]); + + // Matrix for Sepia. 
+ static const int8 kARGBToSepiaB[] = { + 17, 68, 35, 0, + 22, 88, 45, 0, + 24, 98, 50, 0, + }; + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 pixels (a multiple of 8) to test asm version. + ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepiaB[0], 0, 0, 16, 1); + EXPECT_EQ(33u, orig_pixels[0][0]); + EXPECT_EQ(43u, orig_pixels[0][1]); + EXPECT_EQ(47u, orig_pixels[0][2]); + EXPECT_EQ(128u, orig_pixels[0][3]); + EXPECT_EQ(135u, orig_pixels[1][0]); + EXPECT_EQ(175u, orig_pixels[1][1]); + EXPECT_EQ(195u, orig_pixels[1][2]); + EXPECT_EQ(0u, orig_pixels[1][3]); + EXPECT_EQ(69u, orig_pixels[2][0]); + EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(99u, orig_pixels[2][2]); + EXPECT_EQ(255u, orig_pixels[2][3]); + EXPECT_EQ(88u, orig_pixels[3][0]); + EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(127u, orig_pixels[3][2]); + EXPECT_EQ(224u, orig_pixels[3][3]); + + for (int i = 0; i < 256; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + + for (int i = 0; i < 1000 * 1280 * 720 / 256; ++i) { // No checks here. + ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepiaB[0], 0, 0, 256, 1); + } +} + } // namespace libyuv