diff --git a/README.chromium b/README.chromium index b55847b32..38c4df195 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 628 +Version: 629 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9f0b07ea3..3e460594a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -54,6 +54,7 @@ extern "C" { #define HAS_ARGBTOUV422ROW_SSSE3 #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 +#define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 @@ -203,6 +204,7 @@ extern "C" { #define HAS_ARGBTOUV422ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_BGRATOUVROW_NEON @@ -423,6 +425,8 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix); void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int pix); void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, @@ -481,6 +485,8 @@ void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, @@ -489,6 +495,8 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr, @@ -497,6 +505,8 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, @@ -511,6 +521,8 @@ void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix); void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix); +void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix); void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int pix); void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, @@ -531,6 +543,8 @@ void 
ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, uint8* dst_u, uint8* dst_v, int pix); void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int width); void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 589ade927..0630ec743 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 628 +#define LIBYUV_VERSION 629 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 48a645666..7e03bd9f1 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -994,19 +994,19 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; + void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3; + ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3; ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3; if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToUVJRow = ARGBToUVJRow_SSSE3; if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) { ARGBToYJRow = ARGBToYJRow_SSSE3; } @@ -1021,16 +1021,16 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, ARGBToYJRow = ARGBToYJRow_NEON; } if (width >= 16) { - ARGBToUVRow = ARGBToUVRow_Any_NEON; + ARGBToUVJRow = ARGBToUVJRow_Any_NEON; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; + ARGBToUVJRow = ARGBToUVJRow_NEON; } } } #endif for (int y = 0; y < height - 1; y += 2) { - ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToYJRow(src_argb, dst_yj, width); ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width); src_argb += src_stride_argb * 2; @@ -1039,7 +1039,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, dst_v += dst_stride_v; } if (height & 1) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); + ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); ARGBToYJRow(src_argb, dst_yj, width); } return 0; diff --git a/source/row_any.cc b/source/row_any.cc index 3ccb2c0c7..07202596c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -293,6 +293,8 @@ UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) #endif #ifdef HAS_ARGBTOUVROW_SSSE3 UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C, + 4, 15) UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15) UVANY(ABGRToUVRow_Any_SSSE3, 
ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15) UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15) @@ -301,6 +303,7 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) +UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) diff --git a/source/row_common.cc b/source/row_common.cc index 538b04db9..58afcd87e 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -256,25 +256,44 @@ MAKEROWY(RGB24, 2, 1, 0, 3) MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY -// BT.601 mpeg range +// JPeg uses a variation on BT.601-1 full range +// y = 0.29900 * r + 0.58700 * g + 0.11400 * b +// u = -0.16874 * r - 0.33126 * g + 0.50000 * b + center +// v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center +// BT.601 Mpeg range uses: // b 0.1016 * 255 = 25.908 = 25 // g 0.5078 * 255 = 129.489 = 129 // r 0.2578 * 255 = 65.739 = 66 -// = 0.8672. 1/.8672 = 1.1531 -// BT.601 full range 8 bit (not used) -// b 0.1016 * 1.1531 = 0.1172 * 255 = 29.886 = 30 -// g 0.5078 * 1.1531 = 0.5855 * 255 = 149.3025 = 149 -// r 0.2578 * 1.1531 = 0.2973 * 255 = 75.8115 = 76 -// 30 + 149 + 76 = 255 -// BT.601 full range 7 bit -// b 0.1172 * 127 = 14.8844 = 15 -// g 0.5855 * 127 = 74.35855 = 74 -// r 0.2973 * 127 = 37.7571 = 38 +// JPeg 8 bit Y (not used): +// b 0.11400 * 256 = 29.184 = 29 +// g 0.58700 * 256 = 150.272 = 150 +// r 0.29900 * 256 = 76.544 = 77 +// JPeg 7 bit Y: +// b 0.11400 * 128 = 14.592 = 15 +// g 0.58700 * 128 = 75.136 = 75 +// r 0.29900 * 128 = 38.272 = 38 +// JPeg 8 bit U: +// b 0.50000 * 255 = 127.5 = 127 +// g -0.33126 * 255 = -84.4713 = -84 +// r -0.16874 * 255 = -43.0287 = -43 +// JPeg 8 bit V: +// b -0.08131 * 255 = -20.73405 = -20 +// g -0.41869 * 255 = -106.76595 = -107 +// r 0.50000 * 255 = 127.5 = 127 static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (38 * r + 74 * g + 15 * b + 64) >> 7; + return (38 * r + 75 * g + 15 * b + 64) >> 7; } +static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { + return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; +} +static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { + return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; +} + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) + #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ for (int x = 0; x < width; ++x) { \ @@ -283,6 +302,31 @@ void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ dst_y += 1; \ } \ } \ +void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ + uint8* dst_u, uint8* dst_v, int width) { \ + const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ + for (int x = 0; x < width - 1; x += 2) { \ + uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8 ag 
= AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ +} MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ @@ -537,16 +581,9 @@ void ARGBToUV411Row_C(const uint8* src_argb, } } -// http://en.wikipedia.org/wiki/Grayscale. -// 0.11 * B + 0.59 * G + 0.30 * R -// Coefficients rounded to multiple of 2 for consistency with SSSE3 version. -static __inline int RGBToGray(uint8 r, uint8 g, uint8 b) { - return (28 * b + 152 * g + 76 * r) >> 8; -} - void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { for (int x = 0; x < width; ++x) { - uint8 y = RGBToGray(src_argb[2], src_argb[1], src_argb[0]); + uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; diff --git a/source/row_neon.cc b/source/row_neon.cc index ae2ba9be2..bb7d32bb5 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1338,9 +1338,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.1172 coefficient - "vmov.u8 d25, #74 \n" // G * 0.5855 coefficient - "vmov.u8 d26, #38 \n" // R * 0.2973 coefficient + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -1348,7 +1348,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1547,6 +1547,45 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, ); } +// TODO(fbarchard): Subsample match C code. +void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #127 / 4 \n" // UB / VR 0.500 coefficient + "vmov.s16 q11, #84 / 4 \n" // UG -0.33126 coefficient + "vmov.s16 q12, #43 / 4 \n" // UR -0.16874 coefficient + "vmov.s16 q13, #20 / 4 \n" // VB -0.08131 coefficient + "vmov.s16 q14, #107 / 4 \n" // VG -0.41869 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride_argb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( @@ -2365,13 +2404,13 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -// Similar to ARGBToY but different constants, no round and stores ARGB. -// C code is (28 * b + 152 * g + 76 * r) >> 8; +// Similar to ARGBToYJ but stores ARGB. +// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( - "vmov.u8 d24, #14 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #76 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #38 \n" // R * 0.2578 coefficient + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. @@ -2379,7 +2418,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R - "vqshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. diff --git a/source/row_posix.cc b/source/row_posix.cc index 4f722c726..74ae032b4 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -37,17 +37,25 @@ CONST vec8 kARGBToY = { // JPeg full range. CONST vec8 kARGBToYJ = { - 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0 + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; CONST vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; +CONST vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + CONST vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; +CONST vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + // Constants for BGRA CONST vec8 kBGRAToY = { 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 @@ -100,6 +108,10 @@ CONST uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; +CONST uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + // Shuffle table for converting RGB24 to ARGB. CONST uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u @@ -830,6 +842,69 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, ); } +// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa %0,%%xmm4 \n"
+    "movdqa %1,%%xmm3 \n"
+    "movdqa %2,%%xmm5 \n"
+  :
+  : "m"(kARGBToUJ),  // %0
+    "m"(kARGBToVJ),  // %1
+    "m"(kAddUVJ128)  // %2
+  );
+  asm volatile (
+    "sub %1,%2 \n"
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa 0x10(%0),%%xmm1 \n"
+    "movdqa 0x20(%0),%%xmm2 \n"
+    "movdqa 0x30(%0),%%xmm6 \n"
+    "pavgb (%0,%4,1),%%xmm0 \n"
+    "pavgb 0x10(%0,%4,1),%%xmm1 \n"
+    "pavgb 0x20(%0,%4,1),%%xmm2 \n"
+    "pavgb 0x30(%0,%4,1),%%xmm6 \n"
+    "lea 0x40(%0),%0 \n"
+    "movdqa %%xmm0,%%xmm7 \n"
+    "shufps $0x88,%%xmm1,%%xmm0 \n"
+    "shufps $0xdd,%%xmm1,%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm7 \n"
+    "shufps $0x88,%%xmm6,%%xmm2 \n"
+    "shufps $0xdd,%%xmm6,%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm2,%%xmm6 \n"
+    "pmaddubsw %%xmm4,%%xmm0 \n"
+    "pmaddubsw %%xmm4,%%xmm2 \n"
+    "pmaddubsw %%xmm3,%%xmm1 \n"
+    "pmaddubsw %%xmm3,%%xmm6 \n"
+    "phaddw %%xmm2,%%xmm0 \n"
+    "phaddw %%xmm6,%%xmm1 \n"
+    "paddw %%xmm5,%%xmm0 \n"
+    "paddw %%xmm5,%%xmm1 \n"
+    "psraw $0x8,%%xmm0 \n"
+    "psraw $0x8,%%xmm1 \n"
+    "packsswb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%3 \n"
+    "movlps %%xmm0,(%1) \n"
+    "movhps %%xmm0,(%1,%2,1) \n"
+    "lea 0x8(%1),%1 \n"
+    "jg 1b \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+rm"(width)      // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb))
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
@@ -895,6 +970,72 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   );
 }
 
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa %0,%%xmm4 \n"
+    "movdqa %1,%%xmm3 \n"
+    "movdqa %2,%%xmm5 \n"
+  :
+  : "m"(kARGBToUJ),  // %0
+    "m"(kARGBToVJ),  // %1
+    "m"(kAddUVJ128)  // %2
+  );
+  asm volatile (
+    "sub %1,%2 \n"
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqu (%0),%%xmm0 \n"
+    "movdqu 0x10(%0),%%xmm1 \n"
+    "movdqu 0x20(%0),%%xmm2 \n"
+    "movdqu 0x30(%0),%%xmm6 \n"
+    "movdqu (%0,%4,1),%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm0 \n"
+    "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm1 \n"
+    "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm2 \n"
+    "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm6 \n"
+    "lea 0x40(%0),%0 \n"
+    "movdqa %%xmm0,%%xmm7 \n"
+    "shufps $0x88,%%xmm1,%%xmm0 \n"
+    "shufps $0xdd,%%xmm1,%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm7 \n"
+    "shufps $0x88,%%xmm6,%%xmm2 \n"
+    "shufps $0xdd,%%xmm6,%%xmm7 \n"
+    "pavgb %%xmm7,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm2,%%xmm6 \n"
+    "pmaddubsw %%xmm4,%%xmm0 \n"
+    "pmaddubsw %%xmm4,%%xmm2 \n"
+    "pmaddubsw %%xmm3,%%xmm1 \n"
+    "pmaddubsw %%xmm3,%%xmm6 \n"
+    "phaddw %%xmm2,%%xmm0 \n"
+    "phaddw %%xmm6,%%xmm1 \n"
+    "paddw %%xmm5,%%xmm0 \n"
+    "paddw %%xmm5,%%xmm1 \n"
+    "psraw $0x8,%%xmm0 \n"
+    "psraw $0x8,%%xmm1 \n"
+    "packsswb %%xmm1,%%xmm0 \n"
+    "sub $0x10,%3 \n"
+    "movlps %%xmm0,(%1) \n"
+    "movhps %%xmm0,(%1,%2,1) \n"
+    "lea 0x8(%1),%1 \n"
+    "jg 1b \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+rm"(width)      // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb))
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
 void ARGBToUV444Row_SSSE3(const uint8*
src_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -3764,15 +3905,11 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBGRAYROW_SSSE3 -// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R -CONST vec8 kARGBToGray = { - 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 -}; - // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" "sub %0,%1 \n" // 8 pixel loop. @@ -3783,6 +3920,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movdqa (%0),%%xmm2 \n" @@ -3805,10 +3943,11 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : "m"(kARGBToGray) // %3 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } diff --git a/source/row_win.cc b/source/row_win.cc index 3d4293551..2b7c4df11 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -27,7 +27,7 @@ static const vec8 kARGBToY = { // JPeg full range. static const vec8 kARGBToYJ = { - 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0 + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 }; static const lvec8 kARGBToY_AVX = { @@ -39,6 +39,10 @@ static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; +static const vec8 kARGBToUJ = { + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 +}; + // TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version. 
static const lvec8 kARGBToU_AVX = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, @@ -49,6 +53,10 @@ static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; +static const vec8 kARGBToVJ = { + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 +}; + static const lvec8 kARGBToV_AVX = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 @@ -124,6 +132,10 @@ static const uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; +static const uvec16 kAddUVJ128 = { + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u +}; + static const ulvec8 kAddUV128_AVX = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, @@ -1087,6 +1099,73 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + #ifdef HAS_ARGBTOUVROW_AVX2 __declspec(naked) __declspec(align(32)) void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, @@ -1223,6 +1302,77 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, } } +__declspec(naked) __declspec(align(16)) +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kARGBToUJ + movdqa xmm6, kARGBToVJ + movdqa xmm5, kAddUVJ128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax 
+ esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + paddw xmm0, xmm5 // +.5 rounding -> unsigned + paddw xmm1, xmm5 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + __declspec(naked) __declspec(align(16)) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { @@ -4597,11 +4747,6 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 -// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R -static const vec8 kARGBToGray = { - 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 -}; - // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. __declspec(naked) __declspec(align(16)) void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { @@ -4609,7 +4754,8 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* width */ - movdqa xmm4, kARGBToGray + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 sub edx, eax align 16 @@ -4619,6 +4765,7 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { pmaddubsw xmm0, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm0, xmm1 + paddw xmm0, xmm5 // Add .5 for rounding. 
psrlw xmm0, 7 packuswb xmm0, xmm0 // 8 G bytes movdqa xmm2, [eax] // A diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 0fa886071..ac358b4c4 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -689,7 +689,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ benchmark_width_, DIFF, _Opt, +, 0) TESTATOPLANAR(ARGB, 4, I420, 2, 2, 4) +#ifdef __arm__ TESTATOPLANAR(ARGB, 4, J420, 2, 2, 4) +#else +TESTATOPLANAR(ARGB, 4, J420, 2, 2, 0) +#endif TESTATOPLANAR(BGRA, 4, I420, 2, 2, 4) TESTATOPLANAR(ABGR, 4, I420, 2, 2, 4) TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4) diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 506d11a8c..9a217704d 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -269,7 +269,6 @@ TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { TEST_F(libyuvTest, TestARGBGray) { SIMD_ALIGNED(uint8 orig_pixels[256][4]); - // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -285,30 +284,47 @@ TEST_F(libyuvTest, TestARGBGray) { orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; + // Test black + orig_pixels[3][0] = 0u; + orig_pixels[3][1] = 0u; + orig_pixels[3][2] = 0u; + orig_pixels[3][3] = 255u; + // Test white + orig_pixels[4][0] = 255u; + orig_pixels[4][1] = 255u; + orig_pixels[4][2] = 255u; + orig_pixels[4][3] = 255u; // Test color - orig_pixels[3][0] = 16u; - orig_pixels[3][1] = 64u; - orig_pixels[3][2] = 192u; - orig_pixels[3][3] = 224u; + orig_pixels[5][0] = 16u; + orig_pixels[5][1] = 64u; + orig_pixels[5][2] = 192u; + orig_pixels[5][3] = 224u; // Do 16 to test asm version. ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1); - EXPECT_EQ(27u, orig_pixels[0][0]); - EXPECT_EQ(27u, orig_pixels[0][1]); - EXPECT_EQ(27u, orig_pixels[0][2]); + EXPECT_EQ(30u, orig_pixels[0][0]); + EXPECT_EQ(30u, orig_pixels[0][1]); + EXPECT_EQ(30u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); - EXPECT_EQ(151u, orig_pixels[1][0]); - EXPECT_EQ(151u, orig_pixels[1][1]); - EXPECT_EQ(151u, orig_pixels[1][2]); + EXPECT_EQ(149u, orig_pixels[1][0]); + EXPECT_EQ(149u, orig_pixels[1][1]); + EXPECT_EQ(149u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); - EXPECT_EQ(75u, orig_pixels[2][0]); - EXPECT_EQ(75u, orig_pixels[2][1]); - EXPECT_EQ(75u, orig_pixels[2][2]); + EXPECT_EQ(76u, orig_pixels[2][0]); + EXPECT_EQ(76u, orig_pixels[2][1]); + EXPECT_EQ(76u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); - EXPECT_EQ(96u, orig_pixels[3][0]); - EXPECT_EQ(96u, orig_pixels[3][1]); - EXPECT_EQ(96u, orig_pixels[3][2]); - EXPECT_EQ(224u, orig_pixels[3][3]); - + EXPECT_EQ(0u, orig_pixels[3][0]); + EXPECT_EQ(0u, orig_pixels[3][1]); + EXPECT_EQ(0u, orig_pixels[3][2]); + EXPECT_EQ(255u, orig_pixels[3][3]); + EXPECT_EQ(255u, orig_pixels[4][0]); + EXPECT_EQ(255u, orig_pixels[4][1]); + EXPECT_EQ(255u, orig_pixels[4][2]); + EXPECT_EQ(255u, orig_pixels[4][3]); + EXPECT_EQ(96u, orig_pixels[5][0]); + EXPECT_EQ(96u, orig_pixels[5][1]); + EXPECT_EQ(96u, orig_pixels[5][2]); + EXPECT_EQ(224u, orig_pixels[5][3]); for (int i = 0; i < 256; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; @@ -323,7 +339,6 @@ TEST_F(libyuvTest, TestARGBGray) { TEST_F(libyuvTest, TestARGBGrayTo) { SIMD_ALIGNED(uint8 orig_pixels[256][4]); SIMD_ALIGNED(uint8 gray_pixels[256][4]); - // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; @@ -339,30 +354,47 @@ TEST_F(libyuvTest, TestARGBGrayTo) { orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; + // Test black + orig_pixels[3][0] = 0u; + 
orig_pixels[3][1] = 0u; + orig_pixels[3][2] = 0u; + orig_pixels[3][3] = 255u; + // Test white + orig_pixels[4][0] = 255u; + orig_pixels[4][1] = 255u; + orig_pixels[4][2] = 255u; + orig_pixels[4][3] = 255u; // Test color - orig_pixels[3][0] = 16u; - orig_pixels[3][1] = 64u; - orig_pixels[3][2] = 192u; - orig_pixels[3][3] = 224u; + orig_pixels[5][0] = 16u; + orig_pixels[5][1] = 64u; + orig_pixels[5][2] = 192u; + orig_pixels[5][3] = 224u; // Do 16 to test asm version. ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1); - EXPECT_EQ(27u, gray_pixels[0][0]); - EXPECT_EQ(27u, gray_pixels[0][1]); - EXPECT_EQ(27u, gray_pixels[0][2]); + EXPECT_EQ(30u, gray_pixels[0][0]); + EXPECT_EQ(30u, gray_pixels[0][1]); + EXPECT_EQ(30u, gray_pixels[0][2]); EXPECT_EQ(128u, gray_pixels[0][3]); - EXPECT_EQ(151u, gray_pixels[1][0]); - EXPECT_EQ(151u, gray_pixels[1][1]); - EXPECT_EQ(151u, gray_pixels[1][2]); + EXPECT_EQ(149u, gray_pixels[1][0]); + EXPECT_EQ(149u, gray_pixels[1][1]); + EXPECT_EQ(149u, gray_pixels[1][2]); EXPECT_EQ(0u, gray_pixels[1][3]); - EXPECT_EQ(75u, gray_pixels[2][0]); - EXPECT_EQ(75u, gray_pixels[2][1]); - EXPECT_EQ(75u, gray_pixels[2][2]); + EXPECT_EQ(76u, gray_pixels[2][0]); + EXPECT_EQ(76u, gray_pixels[2][1]); + EXPECT_EQ(76u, gray_pixels[2][2]); EXPECT_EQ(255u, gray_pixels[2][3]); - EXPECT_EQ(96u, gray_pixels[3][0]); - EXPECT_EQ(96u, gray_pixels[3][1]); - EXPECT_EQ(96u, gray_pixels[3][2]); - EXPECT_EQ(224u, gray_pixels[3][3]); - + EXPECT_EQ(0u, gray_pixels[3][0]); + EXPECT_EQ(0u, gray_pixels[3][1]); + EXPECT_EQ(0u, gray_pixels[3][2]); + EXPECT_EQ(255u, gray_pixels[3][3]); + EXPECT_EQ(255u, gray_pixels[4][0]); + EXPECT_EQ(255u, gray_pixels[4][1]); + EXPECT_EQ(255u, gray_pixels[4][2]); + EXPECT_EQ(255u, gray_pixels[4][3]); + EXPECT_EQ(96u, gray_pixels[5][0]); + EXPECT_EQ(96u, gray_pixels[5][1]); + EXPECT_EQ(96u, gray_pixels[5][2]); + EXPECT_EQ(224u, gray_pixels[5][3]); for (int i = 0; i < 256; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2;
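The updated ARGBGray/ARGBGrayTo expectations earlier in these tests follow directly from switching the gray path to the JPEG 7-bit Y coefficients (15, 75, 38 with +64 rounding). A small standalone sketch that reproduces the new values; the _Ref helper name and the main() harness are hypothetical and not part of the patch:

// Hypothetical check of the new gray values against the RGBToYJ coefficients.
#include <cassert>
static inline int RGBToYJ_Ref(int r, int g, int b) {
  return (38 * r + 75 * g + 15 * b + 64) >> 7;  // 38 + 75 + 15 = 128; +64 rounds
}
int main() {
  assert(RGBToYJ_Ref(0, 0, 255) == 30);       // blue: was 27 with the old table
  assert(RGBToYJ_Ref(0, 255, 0) == 149);      // green: was 151
  assert(RGBToYJ_Ref(255, 0, 0) == 76);       // red: was 75
  assert(RGBToYJ_Ref(0, 0, 0) == 0);          // black
  assert(RGBToYJ_Ref(255, 255, 255) == 255);  // white is preserved exactly
  assert(RGBToYJ_Ref(192, 64, 16) == 96);     // test color pixel (b=16, g=64, r=192)
  return 0;
}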