ARGBToY uses 8 bit precision instead of 7 bit.

Neon and GCC Intel paths are optimized, but win32 and mips are not optimized.

BUG=libyuv:842, b/141482243

Change-Id: Ia56fa85c8cc1db51f374bd0c89b56d21ec94afa7
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1825642
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2019-10-07 15:34:01 -07:00 committed by Commit Bot
parent e278d4617f
commit fce0fed542
15 changed files with 577 additions and 363 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1735 Version: 1737
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -227,6 +227,30 @@ int H420ToRAW(const uint8_t* src_y,
int width, int width,
int height); int height);
// Convert J420 (JPEG full-range I420) to RGB24.
LIBYUV_API
int J420ToRGB24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height);
// Convert J420 (JPEG full-range I420) to RAW.
LIBYUV_API
int J420ToRAW(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_raw,
int dst_stride_raw,
int width,
int height);
LIBYUV_API LIBYUV_API
int I420ToRGB565(const uint8_t* src_y, int I420ToRGB565(const uint8_t* src_y,
int src_stride_y, int src_stride_y,

View File

@ -210,6 +210,15 @@ int ARGBToJ400(const uint8_t* src_argb,
int width, int width,
int height); int height);
// Convert RGBA to J400. (JPeg full range).
// Produces only the luma (Y) plane. A negative height inverts the image.
LIBYUV_API
int RGBAToJ400(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_yj,
int dst_stride_yj,
int width,
int height);
// Convert ARGB to I400. // Convert ARGB to I400.
LIBYUV_API LIBYUV_API
int ARGBToI400(const uint8_t* src_argb, int ARGBToI400(const uint8_t* src_argb,

View File

@ -274,6 +274,7 @@ extern "C" {
#define HAS_I210TOARGBROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3 #define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3 #define HAS_SWAPUVROW_SSSE3
#endif #endif
@ -298,6 +299,7 @@ extern "C" {
#define HAS_I422TOYUY2ROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2
#define HAS_MERGEUVROW_16_AVX2 #define HAS_MERGEUVROW_16_AVX2
#define HAS_MULTIPLYROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2
#define HAS_RGBATOYJROW_AVX2
#define HAS_SWAPUVROW_AVX2 #define HAS_SWAPUVROW_AVX2
// TODO(fbarchard): Fix AVX2 version of YUV24 // TODO(fbarchard): Fix AVX2 version of YUV24
// #define HAS_NV21TOYUV24ROW_AVX2 // #define HAS_NV21TOYUV24ROW_AVX2
@ -335,6 +337,7 @@ extern "C" {
#define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVJROW_NEON
#define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYJROW_NEON
#define HAS_RGBATOYJROW_NEON
#define HAS_ARGBTOYROW_NEON #define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOVUROW_NEON
@ -951,6 +954,9 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
@ -958,6 +964,7 @@ void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width);
@ -1149,6 +1156,7 @@ void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width);
@ -1159,6 +1167,7 @@ void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
@ -1166,6 +1175,7 @@ void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1735 #define LIBYUV_VERSION 1737
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -764,6 +764,42 @@ int I420ToRAW(const uint8_t* src_y,
width, height); width, height);
} }
// Convert J420 to RGB24.
// Thin wrapper: delegates to the matrix version with the JPEG (full range)
// YUV conversion constants.
LIBYUV_API
int J420ToRGB24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_rgb24,
int dst_stride_rgb24,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_rgb24, dst_stride_rgb24,
&kYuvJPEGConstants, width, height);
}
// Convert J420 to RAW.
// Reuses I420ToRGB24Matrix to get the reversed R/B byte order of RAW by
// swapping the U and V planes and using the YVU variant of the JPEG matrix.
LIBYUV_API
int J420ToRAW(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_raw,
int dst_stride_raw,
int width,
int height) {
return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
src_stride_v, // Swap U and V
src_u, src_stride_u, dst_raw, dst_stride_raw,
&kYvuJPEGConstants, // Use Yvu matrix
width, height);
}
// Convert H420 to RGB24. // Convert H420 to RGB24.
LIBYUV_API LIBYUV_API
int H420ToRGB24(const uint8_t* src_y, int H420ToRGB24(const uint8_t* src_y,

View File

@ -2157,6 +2157,80 @@ int ARGBToJ400(const uint8_t* src_argb,
return 0; return 0;
} }
// Convert RGBA to J400.
// Extracts a full-range (JPEG) luma plane from RGBA input.
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int RGBAToJ400(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_yj,
int dst_stride_yj,
int width,
int height) {
int y;
// Row converter; defaults to the portable C version and is upgraded below
// to the fastest SIMD variant the running CPU supports.
void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
RGBAToYJRow_C;
if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image: start at the last row and walk
// backwards with a negated stride.
if (height < 0) {
height = -height;
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
// Coalesce rows: if both planes are contiguous, process the whole image as
// a single long row.
if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
width *= height;
height = 1;
src_stride_rgba = dst_stride_yj = 0;
}
// Select the best row function. The _Any_ variants handle arbitrary
// widths; the plain variants require the alignment checked here.
#if defined(HAS_RGBATOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGBAToYJRow = RGBAToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_RGBATOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
RGBAToYJRow = RGBAToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
RGBAToYJRow = RGBAToYJRow_AVX2;
}
}
#endif
#if defined(HAS_RGBATOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYJRow = RGBAToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGBAToYJRow = RGBAToYJRow_NEON;
}
}
#endif
#if defined(HAS_RGBATOYJROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
RGBAToYJRow = RGBAToYJRow_Any_MSA;
if (IS_ALIGNED(width, 16)) {
RGBAToYJRow = RGBAToYJRow_MSA;
}
}
#endif
#if defined(HAS_RGBATOYJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGBAToYJRow = RGBAToYJRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
RGBAToYJRow = RGBAToYJRow_MMI;
}
}
#endif
// Convert the plane one row at a time.
for (y = 0; y < height; ++y) {
RGBAToYJRow(src_rgba, dst_yj, width);
src_rgba += src_stride_rgba;
dst_yj += dst_stride_yj;
}
return 0;
}
#ifdef __cplusplus #ifdef __cplusplus
} // extern "C" } // extern "C"
} // namespace libyuv } // namespace libyuv

View File

@ -616,6 +616,9 @@ ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif #endif
#ifdef HAS_RGBATOYJROW_AVX2
ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_AVX2 #ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif #endif
@ -635,6 +638,9 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#ifdef HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif #endif
#ifdef HAS_RGBATOYJROW_SSSE3
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON #ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif #endif
@ -647,6 +653,9 @@ ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
#ifdef HAS_ARGBTOYJROW_NEON #ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif #endif
#ifdef HAS_RGBATOYJROW_NEON
ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_MSA #ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif #endif

View File

@ -20,6 +20,18 @@ namespace libyuv {
extern "C" { extern "C" {
#endif #endif
// The following ifdef from row_win makes the C code match the row_win code,
// which is 7 bit fixed point.
// LIBYUV_RGB7 selects the legacy 7 bit RGBToY/RGBToYJ math so the C
// reference output matches platforms whose SIMD has not been updated to
// 8 bit precision: MSVC x86/x64 (row_win) and MIPS (MMI/MSA).
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
(defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))
#define LIBYUV_RGB7 1
#endif
// MIPS uses 7 bit RGBToY.
#if (!defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)) || \
(!defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa))
#define LIBYUV_RGB7 1
#endif
// llvm x86 is poor at ternary operator, so use branchless min/max. // llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1 #define USE_BRANCHLESS 1
@ -381,9 +393,22 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
} }
} }
#ifdef LIBYUV_RGB7
// Old 7 bit math for compatibility on unsupported platforms.
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
}
#else
// 8 bit
// Intel SSE/AVX uses the following equivalent formula
// 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round.
// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) +
// 0x7e80) >> 8;
static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
} }
#endif
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
@ -448,14 +473,14 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// b 0.1016 * 255 = 25.908 = 25 // b 0.1016 * 255 = 25.908 = 25
// g 0.5078 * 255 = 129.489 = 129 // g 0.5078 * 255 = 129.489 = 129
// r 0.2578 * 255 = 65.739 = 66 // r 0.2578 * 255 = 65.739 = 66
// JPeg 8 bit Y (not used): // JPeg 7 bit Y (deprecated)
// b 0.11400 * 256 = 29.184 = 29
// g 0.58700 * 256 = 150.272 = 150
// r 0.29900 * 256 = 76.544 = 77
// JPeg 7 bit Y:
// b 0.11400 * 128 = 14.592 = 15 // b 0.11400 * 128 = 14.592 = 15
// g 0.58700 * 128 = 75.136 = 75 // g 0.58700 * 128 = 75.136 = 75
// r 0.29900 * 128 = 38.272 = 38 // r 0.29900 * 128 = 38.272 = 38
// JPeg 8 bit Y:
// b 0.11400 * 256 = 29.184 = 29
// g 0.58700 * 256 = 150.272 = 150
// r 0.29900 * 256 = 76.544 = 77
// JPeg 8 bit U: // JPeg 8 bit U:
// b 0.50000 * 255 = 127.5 = 127 // b 0.50000 * 255 = 127.5 = 127
// g -0.33126 * 255 = -84.4713 = -84 // g -0.33126 * 255 = -84.4713 = -84
@ -465,9 +490,17 @@ MAKEROWY(RAW, 0, 1, 2, 3)
// g -0.41869 * 255 = -106.76595 = -107 // g -0.41869 * 255 = -106.76595 = -107
// r 0.50000 * 255 = 127.5 = 127 // r 0.50000 * 255 = 127.5 = 127
#ifdef LIBYUV_RGB7
// Old 7 bit math for compatibility on unsupported platforms.
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (38 * r + 75 * g + 15 * b + 64) >> 7; return (38 * r + 75 * g + 15 * b + 64) >> 7;
} }
#else
// 8 bit
static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
return (77 * r + 150 * g + 29 * b + 128) >> 8;
}
#endif
static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
@ -516,6 +549,7 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
} }
MAKEROWYJ(ARGB, 2, 1, 0, 4) MAKEROWYJ(ARGB, 2, 1, 0, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
#undef MAKEROWYJ #undef MAKEROWYJ
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {

View File

@ -22,12 +22,15 @@ extern "C" {
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
// Constants for ARGB // Constants for ARGB
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
13, 65, 33, 0, 13, 65, 33, 0}; 25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
// JPeg full range. // JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
15, 75, 38, 0, 15, 75, 38, 0}; 29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@ -45,8 +48,8 @@ static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
-20, -107, 127, 0, -20, -107, 127, 0}; -20, -107, 127, 0, -20, -107, 127, 0};
// Constants for BGRA // Constants for BGRA
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
0, 33, 65, 13, 0, 33, 65, 13}; 0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
0, -38, -74, 112, 0, -38, -74, 112}; 0, -38, -74, 112, 0, -38, -74, 112};
@ -55,8 +58,8 @@ static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
0, 112, -94, -18, 0, 112, -94, -18}; 0, 112, -94, -18, 0, 112, -94, -18};
// Constants for ABGR // Constants for ABGR
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
33, 65, 13, 0, 33, 65, 13, 0}; 66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
-38, -74, 112, 0, -38, -74, 112, 0}; -38, -74, 112, 0, -38, -74, 112, 0};
@ -65,8 +68,8 @@ static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
112, -94, -18, 0, 112, -94, -18, 0}; 112, -94, -18, 0, 112, -94, -18, 0};
// Constants for RGBA. // Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
0, 13, 65, 33, 0, 13, 65, 33}; 0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
0, 112, -74, -38, 0, 112, -74, -38}; 0, 112, -74, -38, 0, 112, -74, -38};
@ -74,17 +77,15 @@ static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
0, -18, -94, 112, 0, -18, -94, 112}; 0, -18, -94, 112, 0, -18, -94, 112};
static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; 0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u}; 0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
#ifdef HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_RGB24TOARGBROW_SSSE3
@ -1034,82 +1035,126 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
} }
#endif #endif
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
// round parameter is register containing value to add before shift.
// Shared SSSE3 RGB->Y loop body: converts 16 pixels (64 bytes) per
// iteration. Callers load xmm4 with the pmaddubsw coefficient vector and
// xmm5 with the 0x80 byte bias (kSub128) subtracted before multiply;
// "round" names the register holding the 16 bit constant added before the
// final >> 8 (rounding, and for BT.601 Y also the +16 offset via kAddY16).
// NOTE(review): clobbers xmm0-xmm3 and xmm6 — callers must list them.
#define RGBTOY(round) \
"1: \n" \
"movdqu (%0),%%xmm0 \n" \
"movdqu 0x10(%0),%%xmm1 \n" \
"movdqu 0x20(%0),%%xmm2 \n" \
"movdqu 0x30(%0),%%xmm3 \n" \
"psubb %%xmm5,%%xmm0 \n" \
"psubb %%xmm5,%%xmm1 \n" \
"psubb %%xmm5,%%xmm2 \n" \
"psubb %%xmm5,%%xmm3 \n" \
"movdqu %%xmm4,%%xmm6 \n" \
"pmaddubsw %%xmm0,%%xmm6 \n" \
"movdqu %%xmm4,%%xmm0 \n" \
"pmaddubsw %%xmm1,%%xmm0 \n" \
"movdqu %%xmm4,%%xmm1 \n" \
"pmaddubsw %%xmm2,%%xmm1 \n" \
"movdqu %%xmm4,%%xmm2 \n" \
"pmaddubsw %%xmm3,%%xmm2 \n" \
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
"paddw %%" #round ",%%xmm6 \n" \
"paddw %%" #round ",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
"movdqu %%xmm6,(%1) \n" \
"lea 0x10(%1),%1 \n" \
"sub $0x10,%2 \n" \
"jg 1b \n"
// AVX2 version of RGBTOY: converts 32 pixels (128 bytes) per iteration.
// Callers load ymm4 with the coefficient vector, ymm5 with the 0x80 byte
// bias (kSub128), and ymm6 with the vpermd index (kPermdARGBToY_AVX) that
// undoes the lane interleave of vphaddw/vpackuswb; "round" is as in RGBTOY.
// The macro ends with vzeroupper.
#define RGBTOY_AVX2(round) \
"1: \n" \
"vmovdqu (%0),%%ymm0 \n" \
"vmovdqu 0x20(%0),%%ymm1 \n" \
"vmovdqu 0x40(%0),%%ymm2 \n" \
"vmovdqu 0x60(%0),%%ymm3 \n" \
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
"vmovdqu %%ymm0,(%1) \n" \
"lea 0x20(%1),%1 \n" \
"sub $0x20,%2 \n" \
"jg 1b \n" \
"vzeroupper \n"
#ifdef HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values. // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY(xmm7)
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kARGBToY), // %3 : "m"(kARGBToY), // %3
"m"(kAddY16) // %4 "m"(kSub128), // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); "m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
#endif // HAS_ARGBTOYROW_SSSE3 #endif // HAS_ARGBTOYROW_SSSE3
#ifdef HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding. // Same as ARGBToYRow but different coefficients, no add 16.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n" "movdqa %4,%%xmm5 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY(xmm5)
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"paddw %%xmm5,%%xmm0 \n"
"paddw %%xmm5,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kARGBToYJ), // %3 : "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4 "m"(kSub128) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
} }
#endif // HAS_ARGBTOYJROW_SSSE3 #endif // HAS_ARGBTOYJROW_SSSE3
#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow_SSSE3 but uses kRGBAToYJ (zero weight in the leading
// byte lane for the RGBA byte order); full-range JPEG math, no +16 bias.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
RGBTOY(xmm5)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToYJ), // %3
"m"(kSub128) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif // HAS_RGBATOYJROW_SSSE3
#ifdef HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd. // vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
@ -1119,38 +1164,19 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n" "vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY_AVX2(ymm7)
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kARGBToY), // %3 : "m"(kARGBToY), // %3
"m"(kAddY16), // %4 "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5 "m"(kAddY16), // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); "m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
#endif // HAS_ARGBTOYROW_AVX2 #endif // HAS_ARGBTOYROW_AVX2
@ -1160,42 +1186,22 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n" "vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n" "vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY_AVX2(ymm7)
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kABGRToY), // %3 : "m"(kABGRToY), // %3
"m"(kAddY16), // %4 "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5 "m"(kAddY16), // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); "m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
#endif // HAS_ABGRTOYROW_AVX2 #endif // HAS_ABGRTOYROW_AVX2
#ifdef HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values. // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -1205,38 +1211,37 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vmovdqu %5,%%ymm6 \n" "vmovdqu %5,%%ymm6 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY_AVX2(ymm5)
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
"vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
"vpsrlw $0x7,%%ymm0,%%ymm0 \n"
"vpsrlw $0x7,%%ymm2,%%ymm2 \n"
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kARGBToYJ), // %3 : "m"(kARGBToYJ), // %3
"m"(kAddYJ64), // %4 "m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
// Same as ARGBToYJRow_AVX2 but uses kRGBAToYJ coefficients for RGBA order.
// NOTE(review): RGBTOY_AVX2 already ends with vzeroupper; the explicit
// vzeroupper below appears redundant (harmless) — confirm before removing.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm5)
"vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToYJ), // %3
"m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5 "m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
} }
#endif // HAS_ARGBTOYJROW_AVX2 #endif // HAS_RGBATOYJROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
@ -1488,7 +1493,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+rm"(width) // %3 "+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4 : "r"((intptr_t)(src_stride_argb)), // %4
"m"(kAddUVJ128), // %5 "m"(kSub128), // %5
"m"(kARGBToVJ), // %6 "m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7 "m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8 "m"(kShufARGBToUV_AVX) // %8
@ -1558,7 +1563,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4 : "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5 "m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6 "m"(kARGBToUJ), // %6
"m"(kAddUVJ128) // %7 "m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
} }
#endif // HAS_ARGBTOUVJROW_SSSE3 #endif // HAS_ARGBTOUVJROW_SSSE3
@ -1623,36 +1628,19 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY(xmm7)
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_bgra), // %0 : "+r"(src_bgra), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kBGRAToY), // %3 : "m"(kBGRAToY), // %3
"m"(kAddY16) // %4 "m"(kSub128), // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); "m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@ -1720,70 +1708,36 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY(xmm7)
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_abgr), // %0 : "+r"(src_abgr), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kABGRToY), // %3 : "m"(kABGRToY), // %3
"m"(kAddY16) // %4 "m"(kSub128), // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); "m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movdqa %4,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN LABELALIGN
"1: \n" RGBTOY(xmm7)
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"pmaddubsw %%xmm4,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm2 \n"
"pmaddubsw %%xmm4,%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"phaddw %%xmm1,%%xmm0 \n"
"phaddw %%xmm3,%%xmm2 \n"
"psrlw $0x7,%%xmm0 \n"
"psrlw $0x7,%%xmm2 \n"
"packuswb %%xmm2,%%xmm0 \n"
"paddb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_rgba), // %0 : "+r"(src_rgba), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kRGBAToY), // %3 : "m"(kRGBAToY), // %3
"m"(kAddY16) // %4 "m"(kSub128), // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); "m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
} }
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
@ -4399,7 +4353,7 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x20,%2 \n" "sub $0x20,%2 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
@ -4439,7 +4393,7 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
"vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm1,(%1) \n" "vextractf128 $0x0,%%ymm1,(%1) \n"
"vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x20,%3 \n" "sub $0x20,%3 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
@ -5009,12 +4963,16 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"1: \n" "1: \n"
"movdqu (%0),%%xmm0 \n" "movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm1 \n"
"pmaddubsw %%xmm4,%%xmm0 \n" "psubb %%xmm5,%%xmm0 \n"
"pmaddubsw %%xmm4,%%xmm1 \n" "psubb %%xmm5,%%xmm1 \n"
"phaddw %%xmm1,%%xmm0 \n" "movdqu %%xmm4,%%xmm6 \n"
"paddw %%xmm5,%%xmm0 \n" "pmaddubsw %%xmm0,%%xmm6 \n"
"psrlw $0x7,%%xmm0 \n" "movdqu %%xmm4,%%xmm0 \n"
"packuswb %%xmm0,%%xmm0 \n" "pmaddubsw %%xmm1,%%xmm0 \n"
"phaddw %%xmm0,%%xmm6 \n"
"paddw %%xmm5,%%xmm6 \n"
"psrlw $0x8,%%xmm6 \n"
"packuswb %%xmm6,%%xmm6 \n"
"movdqu (%0),%%xmm2 \n" "movdqu (%0),%%xmm2 \n"
"movdqu 0x10(%0),%%xmm3 \n" "movdqu 0x10(%0),%%xmm3 \n"
"lea 0x20(%0),%0 \n" "lea 0x20(%0),%0 \n"
@ -5022,13 +4980,13 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"psrld $0x18,%%xmm3 \n" "psrld $0x18,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n" "packuswb %%xmm3,%%xmm2 \n"
"packuswb %%xmm2,%%xmm2 \n" "packuswb %%xmm2,%%xmm2 \n"
"movdqa %%xmm0,%%xmm3 \n" "movdqa %%xmm6,%%xmm3 \n"
"punpcklbw %%xmm0,%%xmm0 \n" "punpcklbw %%xmm6,%%xmm6 \n"
"punpcklbw %%xmm2,%%xmm3 \n" "punpcklbw %%xmm2,%%xmm3 \n"
"movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm6,%%xmm1 \n"
"punpcklwd %%xmm3,%%xmm0 \n" "punpcklwd %%xmm3,%%xmm6 \n"
"punpckhwd %%xmm3,%%xmm1 \n" "punpckhwd %%xmm3,%%xmm1 \n"
"movdqu %%xmm0,(%1) \n" "movdqu %%xmm6,(%1) \n"
"movdqu %%xmm1,0x10(%1) \n" "movdqu %%xmm1,0x10(%1) \n"
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x8,%2 \n" "sub $0x8,%2 \n"
@ -5037,8 +4995,8 @@ void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"+r"(dst_argb), // %1 "+r"(dst_argb), // %1
"+r"(width) // %2 "+r"(width) // %2
: "m"(kARGBToYJ), // %3 : "m"(kARGBToYJ), // %3
"m"(kAddYJ64) // %4 "m"(kSub128) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
} }
#endif // HAS_ARGBGRAYROW_SSSE3 #endif // HAS_ARGBGRAYROW_SSSE3

View File

@ -1205,9 +1205,9 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@ -1215,7 +1215,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1246,16 +1246,37 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d1, d24 \n" // B
"vmlal.u8 q2, d2, d25 \n" // G
"vmlal.u8 q2, d3, d26 \n" // R
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -1838,9 +1859,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@ -1849,7 +1870,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1864,9 +1885,9 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@ -1875,7 +1896,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1890,9 +1911,9 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #13 \n" // B * 0.1016 coefficient "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d25, #65 \n" // G * 0.5078 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d26, #33 \n" // R * 0.2578 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d27, #16 \n" // Add 16 constant "vmov.u8 d27, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@ -1901,7 +1922,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n" "vqadd.u8 d0, d27 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1914,9 +1935,9 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
@ -1924,7 +1945,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"vmull.u8 q8, d1, d4 \n" // R "vmull.u8 q8, d1, d4 \n" // R
"vmlal.u8 q8, d2, d5 \n" // G "vmlal.u8 q8, d2, d5 \n" // G
"vmlal.u8 q8, d3, d6 \n" // B "vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1937,9 +1958,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
@ -1947,7 +1968,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vmull.u8 q8, d0, d4 \n" // R "vmull.u8 q8, d0, d4 \n" // R
"vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // B "vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1960,9 +1981,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
@ -1970,7 +1991,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"vmull.u8 q8, d1, d4 \n" // B "vmull.u8 q8, d1, d4 \n" // B
"vmlal.u8 q8, d2, d5 \n" // G "vmlal.u8 q8, d2, d5 \n" // G
"vmlal.u8 q8, d3, d6 \n" // R "vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -1983,9 +2004,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #13 \n" // B * 0.1016 coefficient "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
@ -1993,7 +2014,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
"vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // R "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -2006,9 +2027,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"vmov.u8 d4, #33 \n" // R * 0.2578 coefficient "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
"vmov.u8 d5, #65 \n" // G * 0.5078 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
"vmov.u8 d6, #13 \n" // B * 0.1016 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
"vmov.u8 d7, #16 \n" // Add 16 constant "vmov.u8 d7, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
@ -2016,7 +2037,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"vmull.u8 q8, d0, d4 \n" // B "vmull.u8 q8, d0, d4 \n" // B
"vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d1, d5 \n" // G
"vmlal.u8 q8, d2, d6 \n" // R "vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n" "vqadd.u8 d0, d7 \n"
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n" "bgt 1b \n"
@ -2251,19 +2272,19 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB. // Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile( asm volatile(
"vmov.u8 d24, #15 \n" // B * 0.11400 coefficient "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
"vmov.u8 d25, #75 \n" // G * 0.58700 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop. "subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B "vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R "vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B
"vmov d1, d0 \n" // G "vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R "vmov d2, d0 \n" // R
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.

View File

@ -1262,9 +1262,9 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
@ -1272,7 +1272,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1288,7 +1288,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
// pixels // pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop "subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's. "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
@ -1303,16 +1303,16 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v4.8b, #29 \n" // B * 0.1140 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient
"movi v6.8b, #38 \n" // R * 0.29900 coefficient "movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
@ -1322,6 +1322,27 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
} }
void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
asm volatile(
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v0.8h, v1.8b, v4.8b \n" // B
"umlal v0.8h, v2.8b, v5.8b \n" // G
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}
// 8x1 pixels. // 8x1 pixels.
void ARGBToUV444Row_NEON(const uint8_t* src_argb, void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
@ -1868,9 +1889,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v24.8b, #25 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
@ -1879,7 +1900,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
"umlal v3.8h, v2.8b, v26.8b \n" // R "umlal v3.8h, v2.8b, v26.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n" "uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1895,9 +1916,9 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile( asm volatile(
"movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
@ -1906,7 +1927,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"umull v3.8h, v0.8b, v4.8b \n" // B "umull v3.8h, v0.8b, v4.8b \n" // B
"umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v1.8b, v5.8b \n" // G
"umlal v3.8h, v2.8b, v6.8b \n" // R "umlal v3.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1921,9 +1942,9 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
asm volatile( asm volatile(
"movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v24.8b, #25 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient
"movi v26.8b, #33 \n" // R * 0.2578 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient
"movi v27.8b, #16 \n" // Add 16 constant "movi v27.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
@ -1932,7 +1953,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"umull v3.8h, v0.8b, v24.8b \n" // B "umull v3.8h, v0.8b, v24.8b \n" // B
"umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v1.8b, v25.8b \n" // G
"umlal v3.8h, v2.8b, v26.8b \n" // R "umlal v3.8h, v2.8b, v26.8b \n" // R
"sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n" "uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1945,9 +1966,9 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v4.8b, #66 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v6.8b, #25 \n" // B * 0.1016 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
@ -1955,7 +1976,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"umull v16.8h, v1.8b, v4.8b \n" // R "umull v16.8h, v1.8b, v4.8b \n" // R
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // B "umlal v16.8h, v3.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1968,9 +1989,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
@ -1978,7 +1999,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"umull v16.8h, v0.8b, v4.8b \n" // R "umull v16.8h, v0.8b, v4.8b \n" // R
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // B "umlal v16.8h, v2.8b, v6.8b \n" // B
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -1991,9 +2012,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
@ -2001,7 +2022,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"umull v16.8h, v1.8b, v4.8b \n" // B "umull v16.8h, v1.8b, v4.8b \n" // B
"umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v5.8b \n" // G
"umlal v16.8h, v3.8b, v6.8b \n" // R "umlal v16.8h, v3.8b, v6.8b \n" // R
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -2014,9 +2035,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
@ -2024,7 +2045,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -2037,9 +2058,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
asm volatile( asm volatile(
"movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v6.8b, #25 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient
"movi v6.8b, #13 \n" // B * 0.1016 coefficient "movi v4.8b, #66 \n" // R * 0.2578 coefficient
"movi v7.8b, #16 \n" // Add 16 constant "movi v7.8b, #16 \n" // Add 16 constant
"1: \n" "1: \n"
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
@ -2047,7 +2068,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"umull v16.8h, v0.8b, v4.8b \n" // B "umull v16.8h, v0.8b, v4.8b \n" // B
"umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v1.8b, v5.8b \n" // G
"umlal v16.8h, v2.8b, v6.8b \n" // R "umlal v16.8h, v2.8b, v6.8b \n" // R
"sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n" "uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
@ -2292,19 +2313,19 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB. // Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile( asm volatile(
"movi v24.8b, #15 \n" // B * 0.11400 coefficient "movi v24.8b, #29 \n" // B * 0.1140 coefficient
"movi v25.8b, #75 \n" // G * 0.58700 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient
"movi v26.8b, #38 \n" // R * 0.29900 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop. "subs %w2, %w2, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B "umull v4.8h, v0.8b, v24.8b \n" // B
"umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v1.8b, v25.8b \n" // G
"umlal v4.8h, v2.8b, v26.8b \n" // R "umlal v4.8h, v2.8b, v26.8b \n" // R
"sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
"orr v1.8b, v0.8b, v0.8b \n" // G "orr v1.8b, v0.8b, v0.8b \n" // G
"orr v2.8b, v0.8b, v0.8b \n" // R "orr v2.8b, v0.8b, v0.8b \n" // R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.

View File

@ -27,13 +27,13 @@ namespace libyuv {
#define ERROR_G 1 #define ERROR_G 1
#define ERROR_B 3 #define ERROR_B 3
#define ERROR_FULL 6 #define ERROR_FULL 6
#define ERROR_J420 5 #define ERROR_J420 6
#else #else
#define ERROR_R 1 #define ERROR_R 1
#define ERROR_G 1 #define ERROR_G 1
#define ERROR_B 3 #define ERROR_B 3
#define ERROR_FULL 5 #define ERROR_FULL 5
#define ERROR_J420 3 #define ERROR_J420 4
#endif #endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \

View File

@ -39,7 +39,8 @@
#define ARM_YUV_ERROR 0 #define ARM_YUV_ERROR 0
#endif #endif
// Some functions fail on big endian. Enable these tests on all cpus except PowerPC // Some functions fail on big endian. Enable these tests on all cpus except
// PowerPC
#if !defined(__powerpc__) #if !defined(__powerpc__)
#define LITTLE_ENDIAN_TEST 1 #define LITTLE_ENDIAN_TEST 1
#endif #endif
@ -684,6 +685,8 @@ TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1) TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1) TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
#ifdef LITTLE_ENDIAN_TEST #ifdef LITTLE_ENDIAN_TEST
@ -1209,8 +1212,9 @@ TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 0)
TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 0)
TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1, 0)
TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0)
TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0)
#ifdef LITTLE_ENDIAN_TEST #ifdef LITTLE_ENDIAN_TEST

View File

@ -12,7 +12,6 @@
#include <stdlib.h> #include <stdlib.h>
#include <time.h> #include <time.h>
#include "../unit_test/unit_test.h" #include "../unit_test/unit_test.h"
#include "libyuv/compare.h" #include "libyuv/compare.h"
#include "libyuv/convert.h" #include "libyuv/convert.h"
@ -281,6 +280,7 @@ TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {
} }
} }
// near is for legacy platforms.
TEST_F(LibYUVPlanarTest, TestARGBGray) { TEST_F(LibYUVPlanarTest, TestARGBGray) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
memset(orig_pixels, 0, sizeof(orig_pixels)); memset(orig_pixels, 0, sizeof(orig_pixels));
@ -317,17 +317,17 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
orig_pixels[5][3] = 224u; orig_pixels[5][3] = 224u;
// Do 16 to test asm version. // Do 16 to test asm version.
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1); ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1);
EXPECT_EQ(30u, orig_pixels[0][0]); EXPECT_NEAR(29u, orig_pixels[0][0], 1);
EXPECT_EQ(30u, orig_pixels[0][1]); EXPECT_NEAR(29u, orig_pixels[0][1], 1);
EXPECT_EQ(30u, orig_pixels[0][2]); EXPECT_NEAR(29u, orig_pixels[0][2], 1);
EXPECT_EQ(128u, orig_pixels[0][3]); EXPECT_EQ(128u, orig_pixels[0][3]);
EXPECT_EQ(149u, orig_pixels[1][0]); EXPECT_EQ(149u, orig_pixels[1][0]);
EXPECT_EQ(149u, orig_pixels[1][1]); EXPECT_EQ(149u, orig_pixels[1][1]);
EXPECT_EQ(149u, orig_pixels[1][2]); EXPECT_EQ(149u, orig_pixels[1][2]);
EXPECT_EQ(0u, orig_pixels[1][3]); EXPECT_EQ(0u, orig_pixels[1][3]);
EXPECT_EQ(76u, orig_pixels[2][0]); EXPECT_NEAR(77u, orig_pixels[2][0], 1);
EXPECT_EQ(76u, orig_pixels[2][1]); EXPECT_NEAR(77u, orig_pixels[2][1], 1);
EXPECT_EQ(76u, orig_pixels[2][2]); EXPECT_NEAR(77u, orig_pixels[2][2], 1);
EXPECT_EQ(255u, orig_pixels[2][3]); EXPECT_EQ(255u, orig_pixels[2][3]);
EXPECT_EQ(0u, orig_pixels[3][0]); EXPECT_EQ(0u, orig_pixels[3][0]);
EXPECT_EQ(0u, orig_pixels[3][1]); EXPECT_EQ(0u, orig_pixels[3][1]);
@ -337,9 +337,9 @@ TEST_F(LibYUVPlanarTest, TestARGBGray) {
EXPECT_EQ(255u, orig_pixels[4][1]); EXPECT_EQ(255u, orig_pixels[4][1]);
EXPECT_EQ(255u, orig_pixels[4][2]); EXPECT_EQ(255u, orig_pixels[4][2]);
EXPECT_EQ(255u, orig_pixels[4][3]); EXPECT_EQ(255u, orig_pixels[4][3]);
EXPECT_EQ(96u, orig_pixels[5][0]); EXPECT_NEAR(97u, orig_pixels[5][0], 1);
EXPECT_EQ(96u, orig_pixels[5][1]); EXPECT_NEAR(97u, orig_pixels[5][1], 1);
EXPECT_EQ(96u, orig_pixels[5][2]); EXPECT_NEAR(97u, orig_pixels[5][2], 1);
EXPECT_EQ(224u, orig_pixels[5][3]); EXPECT_EQ(224u, orig_pixels[5][3]);
for (int i = 0; i < 1280; ++i) { for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i; orig_pixels[i][0] = i;
@ -389,30 +389,30 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
orig_pixels[5][3] = 224u; orig_pixels[5][3] = 224u;
// Do 16 to test asm version. // Do 16 to test asm version.
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1); ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1);
EXPECT_EQ(30u, gray_pixels[0][0]); EXPECT_NEAR(30u, gray_pixels[0][0], 1);
EXPECT_EQ(30u, gray_pixels[0][1]); EXPECT_NEAR(30u, gray_pixels[0][1], 1);
EXPECT_EQ(30u, gray_pixels[0][2]); EXPECT_NEAR(30u, gray_pixels[0][2], 1);
EXPECT_EQ(128u, gray_pixels[0][3]); EXPECT_NEAR(128u, gray_pixels[0][3], 1);
EXPECT_EQ(149u, gray_pixels[1][0]); EXPECT_NEAR(149u, gray_pixels[1][0], 1);
EXPECT_EQ(149u, gray_pixels[1][1]); EXPECT_NEAR(149u, gray_pixels[1][1], 1);
EXPECT_EQ(149u, gray_pixels[1][2]); EXPECT_NEAR(149u, gray_pixels[1][2], 1);
EXPECT_EQ(0u, gray_pixels[1][3]); EXPECT_NEAR(0u, gray_pixels[1][3], 1);
EXPECT_EQ(76u, gray_pixels[2][0]); EXPECT_NEAR(76u, gray_pixels[2][0], 1);
EXPECT_EQ(76u, gray_pixels[2][1]); EXPECT_NEAR(76u, gray_pixels[2][1], 1);
EXPECT_EQ(76u, gray_pixels[2][2]); EXPECT_NEAR(76u, gray_pixels[2][2], 1);
EXPECT_EQ(255u, gray_pixels[2][3]); EXPECT_NEAR(255u, gray_pixels[2][3], 1);
EXPECT_EQ(0u, gray_pixels[3][0]); EXPECT_NEAR(0u, gray_pixels[3][0], 1);
EXPECT_EQ(0u, gray_pixels[3][1]); EXPECT_NEAR(0u, gray_pixels[3][1], 1);
EXPECT_EQ(0u, gray_pixels[3][2]); EXPECT_NEAR(0u, gray_pixels[3][2], 1);
EXPECT_EQ(255u, gray_pixels[3][3]); EXPECT_NEAR(255u, gray_pixels[3][3], 1);
EXPECT_EQ(255u, gray_pixels[4][0]); EXPECT_NEAR(255u, gray_pixels[4][0], 1);
EXPECT_EQ(255u, gray_pixels[4][1]); EXPECT_NEAR(255u, gray_pixels[4][1], 1);
EXPECT_EQ(255u, gray_pixels[4][2]); EXPECT_NEAR(255u, gray_pixels[4][2], 1);
EXPECT_EQ(255u, gray_pixels[4][3]); EXPECT_NEAR(255u, gray_pixels[4][3], 1);
EXPECT_EQ(96u, gray_pixels[5][0]); EXPECT_NEAR(96u, gray_pixels[5][0], 1);
EXPECT_EQ(96u, gray_pixels[5][1]); EXPECT_NEAR(96u, gray_pixels[5][1], 1);
EXPECT_EQ(96u, gray_pixels[5][2]); EXPECT_NEAR(96u, gray_pixels[5][2], 1);
EXPECT_EQ(224u, gray_pixels[5][3]); EXPECT_NEAR(224u, gray_pixels[5][3], 1);
for (int i = 0; i < 1280; ++i) { for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i; orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2; orig_pixels[i][1] = i / 2;
@ -422,6 +422,20 @@ TEST_F(LibYUVPlanarTest, TestARGBGrayTo) {
for (int i = 0; i < benchmark_pixels_div1280_; ++i) { for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1); ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1);
} }
for (int i = 0; i < 256; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i;
orig_pixels[i][2] = i;
orig_pixels[i][3] = i;
}
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1);
for (int i = 0; i < 256; ++i) {
EXPECT_EQ(i, orig_pixels[i][0]);
EXPECT_EQ(i, orig_pixels[i][1]);
EXPECT_EQ(i, orig_pixels[i][2]);
EXPECT_EQ(i, orig_pixels[i][3]);
}
} }
TEST_F(LibYUVPlanarTest, TestARGBSepia) { TEST_F(LibYUVPlanarTest, TestARGBSepia) {