From c99db063e24d6180740d4adc29e84159096eef2d Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 1 Oct 2013 01:27:30 +0000 Subject: [PATCH] Change ARGBColorMatrix to a 4x4. BUG=none TEST=planar_unitest updates R=johannkoenig@google.com, ryanpetrie@google.com, thorcarpenter@google.com Review URL: https://webrtc-codereview.appspot.com/2320008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@805 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 21 ++++++- include/libyuv/row.h | 11 ++-- include/libyuv/version.h | 2 +- source/planar_functions.cc | 67 +++++++++++++++++----- source/row_common.cc | 22 +++++--- source/row_neon.cc | 40 ++++++++----- source/row_posix.cc | 83 ++++++++++++++------------- source/row_win.cc | 73 +++++++++++++----------- unit_test/planar_test.cc | 94 ++++++++++++++++++++++++++++--- 10 files changed, 289 insertions(+), 126 deletions(-) diff --git a/README.chromium b/README.chromium index 08e3028e7..7eeb253f8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 804 +Version: 805 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 31490012c..7ca6e7fd8 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -186,15 +186,32 @@ LIBYUV_API int ARGBSepia(uint8* dst_argb, int dst_stride_argb, int x, int y, int width, int height); +// Deprecated. Use ARGBColorMatrix instead. // Apply a matrix rotation to each ARGB pixel. // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. // The first 4 coefficients apply to B, G, R, A and produce B of the output. // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. 
LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int x, int y, int width, int height); + +// Temporary API mapper. +// #define ARGBColorMatrix(d, s, m, x, y, w, h) \ +// RGBColorMatrix(d, s, m, x, y, w, h) + +// Apply a matrix rotation to each ARGB pixel. +// matrix_argb is 16 signed coefficients (4 rows of 4). 64 represents 1.0, so -128 to 127 covers -2.0 to just under 2.0. +// The first 4 coefficients apply to B, G, R, A and produce B of the output. +// The next 4 coefficients apply to B, G, R, A and produce G of the output. +// The next 4 coefficients apply to B, G, R, A and produce R of the output. +// The last 4 coefficients apply to B, G, R, A and produce A of the output. +LIBYUV_API +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, const int8* matrix_argb, - int x, int y, int width, int height); + int width, int height); // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2e78ba0b7..d2a23410b 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -1456,11 +1456,12 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); void ARGBSepiaRow_NEON(uint8* dst_argb, int width); -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width); -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width); +void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width); void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index a451d05d8..92fd1129b 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 804 +#define LIBYUV_VERSION 805 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 594641d8b..ee0487a73 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1345,24 +1345,31 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, return 0; } -// Apply a 4x3 matrix rotation to each ARGB pixel. +// Apply a 4x4 matrix to each ARGB pixel. +// Note: Normally for shading, but can be used to swizzle or invert. 
LIBYUV_API -int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, +int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, + uint8* dst_argb, int dst_stride_argb, const int8* matrix_argb, - int dst_x, int dst_y, int width, int height) { - if (!dst_argb || !matrix_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + int width, int height) { + if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } // Coalesce contiguous rows. - if (dst_stride_argb == width * 4) { - return ARGBColorMatrix(dst_argb, dst_stride_argb, + if (src_stride_argb == width * 4 && + dst_stride_argb == width * 4) { + return ARGBColorMatrix(src_argb, 0, + dst_argb, 0, matrix_argb, - dst_x, dst_y, width * height, 1); } - void (*ARGBColorMatrixRow)(uint8* dst_argb, const int8* matrix_argb, - int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1373,14 +1380,48 @@ int ARGBColorMatrix(uint8* dst_argb, int dst_stride_argb, ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; for (int y = 0; y < height; ++y) { - ARGBColorMatrixRow(dst, matrix_argb, width); - dst += dst_stride_argb; + ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; } return 0; } +// Apply a 4x3 matrix to each ARGB pixel. +// Deprecated. 
+LIBYUV_API +int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, + const int8* matrix_rgb, + int dst_x, int dst_y, int width, int height) { + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || + dst_x < 0 || dst_y < 0) { + return -1; + } + + // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. + SIMD_ALIGNED(int8 matrix_argb[16]); + matrix_argb[0] = matrix_rgb[0] / 2; + matrix_argb[1] = matrix_rgb[1] / 2; + matrix_argb[2] = matrix_rgb[2] / 2; + matrix_argb[3] = matrix_rgb[3] / 2; + matrix_argb[4] = matrix_rgb[4] / 2; + matrix_argb[5] = matrix_rgb[5] / 2; + matrix_argb[6] = matrix_rgb[6] / 2; + matrix_argb[7] = matrix_rgb[7] / 2; + matrix_argb[8] = matrix_rgb[8] / 2; + matrix_argb[9] = matrix_rgb[9] / 2; + matrix_argb[10] = matrix_rgb[10] / 2; + matrix_argb[11] = matrix_rgb[11] / 2; + matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; + matrix_argb[15] = 64; // 1.0 + + uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + return ARGBColorMatrix(const_cast<const uint8*>(dst), dst_stride_argb, + dst, dst_stride_argb, + &matrix_argb[0], width, height); +} + // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API diff --git a/source/row_common.cc b/source/row_common.cc index 3ba0cdb69..f412336e5 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -654,21 +654,27 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { } // Apply color matrix to a row of image. Matrix is signed. -void ARGBColorMatrixRow_C(uint8* dst_argb, const int8* matrix_argb, int width) { +// TODO(fbarchard): Consider adding rounding (+32). 
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { for (int x = 0; x < width; ++x) { - int b = dst_argb[0]; - int g = dst_argb[1]; - int r = dst_argb[2]; - int a = dst_argb[3]; + int b = src_argb[0]; + int g = src_argb[1]; + int r = src_argb[2]; + int a = src_argb[3]; int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 7; + r * matrix_argb[2] + a * matrix_argb[3]) >> 6; int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 7; + r * matrix_argb[6] + a * matrix_argb[7]) >> 6; int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 7; + r * matrix_argb[10] + a * matrix_argb[11]) >> 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + + r * matrix_argb[14] + a * matrix_argb[15]) >> 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); + dst_argb[3] = Clamp(sa); + src_argb += 4; dst_argb += 4; } } diff --git a/source/row_neon.cc b/source/row_neon.cc index bf0167540..f721a94dd 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2474,18 +2474,19 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { } // Tranform 8 ARGB pixels (32 bytes) with color matrix. -// Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, - int width) { +// TODO(fbarchard): Was same as Sepia except matrix is provided. This function +// needs to saturate. Consider doing a non-saturating version. +void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors. + "vld1.8 {q2}, [%3] \n" // load 4 rows of ARGB coefficients (16 bytes). "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. 
".p2align 2 \n" "1: \n" - "vld4.8 {d16, d18, d20, d22}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q9, d18 \n" // g "vmovl.u8 q10, d20 \n" // r @@ -2493,33 +2494,42 @@ void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A. NOTE(review): overwrites q15, yet the A * Matrix multiplies below still read q15 as source alpha - confirm. "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqshrun.s16 d16, q12, #7 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #7 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #7 \n" // 16 bit to 8 bit R - "vst4.8 {d16, d18, d20, d22}, [%0]! \n" // store 8 ARGB pixels. 
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q8", "q9", + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } diff --git a/source/row_posix.cc b/source/row_posix.cc index 56b34ff45..839c7e370 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4327,62 +4327,67 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { asm volatile ( - "movd "MEMACCESS(2)",%%xmm2 \n" - "movd "MEMACCESS2(0x4,2)",%%xmm3 \n" - "movd "MEMACCESS2(0x8,2)",%%xmm4 \n" + "movd "MEMACCESS(3)",%%xmm2 \n" + "movd "MEMACCESS2(0x4,3)",%%xmm3 \n" + "movd "MEMACCESS2(0x8,3)",%%xmm4 \n" + "movd "MEMACCESS2(0xc,3)",%%xmm5 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" // 8 pixel loop. 
".p2align 4 \n" "1: \n" "movdqa "MEMACCESS(0)",%%xmm0 \n" - "movdqa "MEMACCESS2(0x10,0)",%%xmm6 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "movdqa "MEMACCESS(0)",%%xmm5 \n" - "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm6,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm0 \n" - "psraw $0x7,%%xmm5 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqa "MEMACCESS(0)",%%xmm5 \n" - "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddsw %%xmm1,%%xmm5 \n" - "psraw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" "movdqa "MEMACCESS(0)",%%xmm6 \n" "movdqa "MEMACCESS2(0x10,0)",%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "sub $0x8,%1 \n" - "movdqa %%xmm0,"MEMACCESS(0)" \n" - "movdqa %%xmm1,"MEMACCESS2(0x10,0)" \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqa "MEMACCESS(0)",%%xmm1 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqa "MEMACCESS(0)",%%xmm6 \n" + "movdqa "MEMACCESS2(0x10,0)",%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + 
"sub $0x8,%2 \n" + "movdqa %%xmm0,"MEMACCESS(1)" \n" + "movdqa %%xmm6,"MEMACCESS2(0x10,1)" \n" "lea "MEMLEA(0x20,0)",%0 \n" + "lea "MEMLEA(0x20,1)",%1 \n" "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(matrix_argb) // %2 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" #endif ); } diff --git a/source/row_win.cc b/source/row_win.cc index 316f29df6..840eb7531 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5002,57 +5002,62 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. __declspec(naked) __declspec(align(16)) -void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, - int width) { +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + const int8* matrix_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov edx, [esp + 8] /* matrix_argb */ - mov ecx, [esp + 12] /* width */ - movd xmm2, [edx] - movd xmm3, [edx + 4] - movd xmm4, [edx + 8] + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ + movd xmm2, [ecx] + movd xmm3, [ecx + 4] + movd xmm4, [ecx + 8] + movd xmm5, [ecx + 12] pshufd xmm2, xmm2, 0 pshufd xmm3, xmm3, 0 pshufd xmm4, xmm4, 0 + pshufd xmm5, xmm5, 0 + mov ecx, [esp + 16] /* width */ align 16 convertloop: movdqa xmm0, [eax] // B - movdqa xmm6, [eax + 16] + movdqa xmm7, [eax + 16] pmaddubsw xmm0, xmm2 - pmaddubsw xmm6, xmm2 - movdqa xmm5, [eax] // G + pmaddubsw xmm7, xmm2 + movdqa xmm6, [eax] // G movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm3 + pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm6 // B - phaddsw xmm5, xmm1 // G - psraw xmm0, 7 // B 
- psraw xmm5, 7 // G + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G packuswb xmm0, xmm0 // 8 B values - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values - movdqa xmm5, [eax] // R - movdqa xmm1, [eax + 16] - pmaddubsw xmm5, xmm4 + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values + movdqa xmm1, [eax] // R + movdqa xmm7, [eax + 16] pmaddubsw xmm1, xmm4 - phaddsw xmm5, xmm1 - psraw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + pmaddubsw xmm7, xmm4 + phaddsw xmm1, xmm7 // R movdqa xmm6, [eax] // A - movdqa xmm1, [eax + 16] - psrld xmm6, 24 - psrld xmm1, 24 - packuswb xmm6, xmm1 + movdqa xmm7, [eax + 16] + pmaddubsw xmm6, xmm5 + pmaddubsw xmm7, xmm5 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklbw xmm5, xmm6 // 8 RA values - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 sub ecx, 8 - movdqa [eax], xmm0 - movdqa [eax + 16], xmm1 + movdqa [edx], xmm0 + movdqa [edx + 16], xmm6 lea eax, [eax + 32] + lea edx, [edx + 32] jg convertloop ret } diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index c51d26ea5..009690e07 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -484,9 +484,87 @@ TEST_F(libyuvTest, TestARGBSepia) { TEST_F(libyuvTest, TestARGBColorMatrix) { SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]); + SIMD_ALIGNED(uint8 dst_pixels_c[1280][4]); // Matrix for Sepia. - static const int8 kARGBToSepia[] = { + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { + 17 / 2, 68 / 2, 35 / 2, 0, + 22 / 2, 88 / 2, 45 / 2, 0, + 24 / 2, 98 / 2, 50 / 2, 0, + 0, 0, 0, 64, // Copy alpha. 
+ }; + memset(orig_pixels, 0, sizeof(orig_pixels)); + + // Test blue + orig_pixels[0][0] = 255u; + orig_pixels[0][1] = 0u; + orig_pixels[0][2] = 0u; + orig_pixels[0][3] = 128u; + // Test green + orig_pixels[1][0] = 0u; + orig_pixels[1][1] = 255u; + orig_pixels[1][2] = 0u; + orig_pixels[1][3] = 0u; + // Test red + orig_pixels[2][0] = 0u; + orig_pixels[2][1] = 0u; + orig_pixels[2][2] = 255u; + orig_pixels[2][3] = 255u; + // Test color + orig_pixels[3][0] = 16u; + orig_pixels[3][1] = 64u; + orig_pixels[3][2] = 192u; + orig_pixels[3][3] = 224u; + // Do 16 to test asm version. + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 16, 1); + EXPECT_EQ(31u, dst_pixels_opt[0][0]); + EXPECT_EQ(43u, dst_pixels_opt[0][1]); + EXPECT_EQ(47u, dst_pixels_opt[0][2]); + EXPECT_EQ(128u, dst_pixels_opt[0][3]); + EXPECT_EQ(135u, dst_pixels_opt[1][0]); + EXPECT_EQ(175u, dst_pixels_opt[1][1]); + EXPECT_EQ(195u, dst_pixels_opt[1][2]); + EXPECT_EQ(0u, dst_pixels_opt[1][3]); + EXPECT_EQ(67u, dst_pixels_opt[2][0]); + EXPECT_EQ(87u, dst_pixels_opt[2][1]); + EXPECT_EQ(99u, dst_pixels_opt[2][2]); + EXPECT_EQ(255u, dst_pixels_opt[2][3]); + EXPECT_EQ(87u, dst_pixels_opt[3][0]); + EXPECT_EQ(112u, dst_pixels_opt[3][1]); + EXPECT_EQ(127u, dst_pixels_opt[3][2]); + EXPECT_EQ(224u, dst_pixels_opt[3][3]); + + for (int i = 0; i < 1280; ++i) { + orig_pixels[i][0] = i; + orig_pixels[i][1] = i / 2; + orig_pixels[i][2] = i / 3; + orig_pixels[i][3] = i; + } + MaskCpuFlags(0); + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, + &kRGBToSepia[0], 1280, 1); + MaskCpuFlags(-1); + + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, + &kRGBToSepia[0], 1280, 1); + } + + for (int i = 0; i < 1280; ++i) { + EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); + EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); + EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); + EXPECT_EQ(dst_pixels_c[i][3], 
dst_pixels_opt[i][3]); + } +} + +TEST_F(libyuvTest, TestRGBColorMatrix) { + SIMD_ALIGNED(uint8 orig_pixels[1280][4]); + + // Matrix for Sepia. + SIMD_ALIGNED(static const int8 kRGBToSepia[]) = { 17, 68, 35, 0, 22, 88, 45, 0, 24, 98, 50, 0, @@ -515,8 +593,8 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 16, 1); - EXPECT_EQ(33u, orig_pixels[0][0]); + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1); + EXPECT_EQ(31u, orig_pixels[0][0]); EXPECT_EQ(43u, orig_pixels[0][1]); EXPECT_EQ(47u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); @@ -524,12 +602,12 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { EXPECT_EQ(175u, orig_pixels[1][1]); EXPECT_EQ(195u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); - EXPECT_EQ(69u, orig_pixels[2][0]); - EXPECT_EQ(89u, orig_pixels[2][1]); + EXPECT_EQ(67u, orig_pixels[2][0]); + EXPECT_EQ(87u, orig_pixels[2][1]); EXPECT_EQ(99u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); - EXPECT_EQ(88u, orig_pixels[3][0]); - EXPECT_EQ(114u, orig_pixels[3][1]); + EXPECT_EQ(87u, orig_pixels[3][0]); + EXPECT_EQ(112u, orig_pixels[3][1]); EXPECT_EQ(127u, orig_pixels[3][2]); EXPECT_EQ(224u, orig_pixels[3][3]); @@ -540,7 +618,7 @@ TEST_F(libyuvTest, TestARGBColorMatrix) { orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { - ARGBColorMatrix(&orig_pixels[0][0], 0, &kARGBToSepia[0], 0, 0, 1280, 1); + RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1); } }