mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
ARGBToY for AVX512
- add ARGBToYMatrixRow_AVX512BW
- refactor SSE and AVX to use Matrix functions, making old functions
call the new ones.
Zen5 1280x720
Was AVX2 LibYUVConvertTest.ARGBToI444_Opt (1125 ms)
Now AVX512 LibYUVConvertTest.ARGBToI444_Opt (641 ms)
Details by Gemini:
1. Created 3 new Matrix functions:
Added ARGBToYMatrixRow_SSSE3, ARGBToYMatrixRow_AVX2, and
ARGBToYMatrixRow_AVX512BW to source/row_gcc.cc. These take the
const struct ArgbConstants* c parameter similarly to
ARGBToUV444MatrixRow_*. The x86 vector instructions dynamically
calculate the needed values using the properties of the constants
struct, including using vpmaddwd inside the AVX512 code to offset
the lack of a native vphaddw.
2. Replaced Old Functions with Wrappers:
Modified the existing implementations of ARGBToYRow_SSSE3,
ARGBToYJRow_SSSE3, ABGRToYRow_SSSE3, ABGRToYJRow_SSSE3,
RGBAToYRow_SSSE3, RGBAToYJRow_SSSE3, BGRAToYRow_SSSE3 (and their
_AVX2 equivalents) in source/row_gcc.cc to act as inline wrappers
calling the new ARGBToYMatrixRow_* functions, passing the right
matrix parameters (e.g. &kArgbI601Constants, &kArgbJPEGConstants,
&kAbgrI601Constants).
3. Added row_any.cc Handlers:
Added ANY11MC definitions to source/row_any.cc to autogenerate
ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_Any_AVX2, and
ARGBToYMatrixRow_Any_AVX512BW which safely handles non-aligned
tails.
4. Updated include/libyuv/row.h:
Updated the headers with the proper void declarations for all newly
generated Matrix and Any_ variants. Also defined
HAS_ARGBTOYROW_AVX512BW in the CPU macros.
5. Tested the Implementations:
Compiled and tested on Linux x86, which resulted in all tests passing
cleanly. Also successfully completed all Windows 32-bit build checks
ensuring 32-bit regression prevention without issues.
Bug: 477295731
Change-Id: I4f5eec9a961e24a9d760d0a1c0810fb5e29a0bd1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7759494
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
644251f252
commit
893eacf9b4
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1924
|
||||
Version: 1925
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -381,6 +381,7 @@ extern "C" {
|
||||
#define HAS_I422TOARGBROW_AVX512BW
|
||||
#define HAS_ARGBTOUV444ROW_AVX512BW
|
||||
#define HAS_ARGBTOUV444MATRIXROW_AVX512BW
|
||||
#define HAS_ARGBTOYROW_AVX512BW
|
||||
#define HAS_ARGBTOUVJ444ROW_AVX512BW
|
||||
#endif
|
||||
|
||||
@ -1746,19 +1747,31 @@ void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||
void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||
void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width);
|
||||
void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
|
||||
void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
|
||||
void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
|
||||
void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
|
||||
void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
|
||||
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width);
|
||||
void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
|
||||
@ -2149,6 +2162,31 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_Any_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
@ -2204,6 +2242,15 @@ void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width);
|
||||
void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
|
||||
void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
|
||||
void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
|
||||
void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1924
|
||||
#define LIBYUV_VERSION 1925
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -2091,6 +2091,14 @@ int ARGBToI420(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
|
||||
@ -2522,6 +2530,14 @@ int BGRAToI420(const uint8_t* src_bgra,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
BGRAToYRow = BGRAToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
BGRAToYRow = BGRAToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_BGRATOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
BGRAToUVRow = BGRAToUVRow_Any_AVX2;
|
||||
@ -2621,6 +2637,14 @@ int ABGRToI420(const uint8_t* src_abgr,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ABGRToYRow = ABGRToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ABGRToYRow = ABGRToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ABGRTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
|
||||
@ -2752,6 +2776,22 @@ int RGBAToI420(const uint8_t* src_rgba,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RGBAToYRow = RGBAToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RGBAToYRow = RGBAToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
RGBAToYRow = RGBAToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
RGBAToYRow = RGBAToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGBATOUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
|
||||
@ -3125,6 +3165,14 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYJRow = ARGBToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUVJROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
|
||||
|
||||
@ -124,6 +124,14 @@ int ARGBToI444(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYRow = ARGBToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||
@ -1083,6 +1091,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ABGRToYRow = ABGRToYRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ABGRToYRow = ABGRToYRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ABGRTOUVROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
|
||||
@ -2710,6 +2726,14 @@ int ARGBToJ444(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYJRow = ARGBToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||
@ -3171,6 +3195,14 @@ int RGBAToJ400(const uint8_t* src_rgba,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
RGBAToYJRow = RGBAToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
RGBAToYJRow = RGBAToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGBATOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
RGBAToYJRow = RGBAToYJRow_Any_NEON;
|
||||
@ -3268,6 +3300,14 @@ int ABGRToJ420(const uint8_t* src_abgr,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ABGRToYJRow = ABGRToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ABGRToYJRow = ABGRToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ABGRTOUVJROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ABGRToUVJRow = ABGRToUVJRow_Any_AVX2;
|
||||
|
||||
@ -4777,6 +4777,14 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYJRow = ARGBToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||
|
||||
@ -1070,11 +1070,29 @@ ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(ARGBToYRow_Any_AVX512BW, ARGBToYRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_BGRATOYROW_SSSE3
|
||||
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
|
||||
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
|
||||
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(BGRAToYRow_Any_AVX512BW, BGRAToYRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
ANY11(BGRAToYRow_Any_AVX2, BGRAToYRow_AVX2, 0, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(RGBAToYRow_Any_AVX512BW, RGBAToYRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
ANY11(RGBAToYRow_Any_AVX2, RGBAToYRow_AVX2, 0, 4, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(ABGRToYRow_Any_AVX512BW, ABGRToYRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_YUY2TOYROW_SSE2
|
||||
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
|
||||
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
|
||||
@ -1082,12 +1100,21 @@ ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
|
||||
#ifdef HAS_ARGBTOYJROW_SSSE3
|
||||
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(ARGBToYJRow_Any_AVX512BW, ARGBToYJRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_ABGRTOYJROW_SSSE3
|
||||
ANY11(ABGRToYJRow_Any_SSSE3, ABGRToYJRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(ABGRToYJRow_Any_AVX512BW, ABGRToYJRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_RGBATOYJROW_SSSE3
|
||||
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11(RGBAToYJRow_Any_AVX512BW, RGBAToYJRow_AVX512BW, 0, 4, 1, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_NEON
|
||||
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
|
||||
#endif
|
||||
@ -2282,6 +2309,15 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
|
||||
memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYMATRIXROW_NEON
|
||||
ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
|
||||
#endif
|
||||
|
||||
@ -24,37 +24,21 @@ extern "C" {
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
|
||||
|
||||
// Constants for ARGB
|
||||
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
|
||||
25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};
|
||||
|
||||
// JPeg full range.
|
||||
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
|
||||
29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};
|
||||
|
||||
static const uvec8 kABGRToYJ = {77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u,
|
||||
77u, 150u, 29u, 0u, 77u, 150u, 29u, 0u};
|
||||
|
||||
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
|
||||
0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
|
||||
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
|
||||
// Constants for BGRA
|
||||
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
|
||||
0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};
|
||||
|
||||
// Constants for ABGR
|
||||
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
|
||||
66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};
|
||||
|
||||
// Constants for RGBA.
|
||||
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
|
||||
0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};
|
||||
// 126 (7e) - (-109..110) = 16..235
|
||||
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
|
||||
0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};
|
||||
static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
|
||||
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
@ -1367,22 +1351,9 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
|
||||
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7) //
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
#endif // HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
@ -1390,198 +1361,203 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
|
||||
// Same as ARGBToYRow but different coefficients, no add 16.
|
||||
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7) //
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kArgbJPEGConstants);
|
||||
}
|
||||
#endif // HAS_ARGBTOYJROW_SSSE3
|
||||
|
||||
#ifdef HAS_ABGRTOYJROW_SSSE3
|
||||
// Convert 16 ABGR pixels (64 bytes) to 16 YJ values.
|
||||
// Same as ABGRToYRow but different coefficients, no add 16.
|
||||
void ABGRToYJRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7) //
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kABGRToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void ABGRToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
#endif // HAS_ABGRTOYJROW_SSSE3
|
||||
|
||||
#ifdef HAS_RGBATOYJROW_SSSE3
|
||||
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
|
||||
// Same as ARGBToYRow but different coefficients, no add 16.
|
||||
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7) //
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kRGBAToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void RGBAToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
#endif // HAS_RGBATOYJROW_SSSE3
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ABGRTOYROW_AVX2) || \
|
||||
defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
|
||||
// vpermd for vphaddw + vpackuswb vpermd.
|
||||
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
|
||||
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqa %6,%%ymm6 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY_AVX2(ymm7) //
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_ARGBTOYROW_AVX2
|
||||
|
||||
#ifdef HAS_ABGRTOYROW_AVX2
|
||||
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
|
||||
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqa %6,%%ymm6 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY_AVX2(ymm7) //
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kABGRToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void ABGRToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_ABGRTOYROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOYJROW_AVX2
|
||||
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqa %6,%%ymm6 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY_AVX2(ymm7) //
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kARGBToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kArgbJPEGConstants);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // HAS_ARGBTOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_ABGRTOYJROW_AVX2
|
||||
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
|
||||
void ABGRToYJRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqa %6,%%ymm6 \n" //
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY_AVX2(ymm7) //
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kABGRToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void ABGRToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_ABGRTOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_RGBATOYJROW_AVX2
|
||||
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
|
||||
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm4 \n"
|
||||
"vbroadcastf128 %4,%%ymm5 \n"
|
||||
"vbroadcastf128 %5,%%ymm7 \n"
|
||||
"vmovdqa %6,%%ymm6 \n" //
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void RGBAToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
#endif
|
||||
#endif // HAS_RGBATOYJROW_AVX2
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY_AVX2(ymm7) //
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kRGBAToYJ), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY0), // %5
|
||||
"m"(kPermdARGBToY_AVX) // %6
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOUV444ROW_AVX2) || \
|
||||
defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
|
||||
// vpermd for vphaddw + vpackuswb vpermd.
|
||||
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psllw $15,%%xmm5 \n"
|
||||
"packsswb %%xmm5,%%xmm5 \n"
|
||||
"movdqa 0(%3),%%xmm4 \n"
|
||||
"movdqa 0x60(%3),%%xmm7 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"pmaddubsw %%xmm5,%%xmm6 \n"
|
||||
"phaddw %%xmm6,%%xmm6 \n"
|
||||
"psubw %%xmm6,%%xmm7 \n"
|
||||
LABELALIGN ""
|
||||
RGBTOY(xmm7)
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_RGBATOYJROW_AVX2
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vpsllw $15,%%ymm5,%%ymm5 \n"
|
||||
"vpacksswb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vbroadcastf128 0(%3),%%ymm4 \n"
|
||||
"vbroadcastf128 0x60(%3),%%ymm7 \n"
|
||||
"vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n"
|
||||
"vphaddw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsubw %%ymm6,%%ymm7,%%ymm7 \n"
|
||||
"vmovdqa %4,%%ymm6 \n"
|
||||
LABELALIGN ""
|
||||
RGBTOY_AVX2(ymm7)
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c), // %3
|
||||
"m"(kPermdARGBToY_AVX) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW)
|
||||
static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15};
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n"
|
||||
"vpsllw $15,%%zmm16,%%zmm5 \n"
|
||||
"vpacksswb %%zmm5,%%zmm5,%%zmm5 \n"
|
||||
"vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1
|
||||
"vbroadcasti64x4 0(%3),%%zmm4 \n"
|
||||
"vbroadcasti64x4 0x60(%3),%%zmm7 \n"
|
||||
"vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n"
|
||||
"vpackssdw %%zmm6,%%zmm6,%%zmm6 \n"
|
||||
"vpsubw %%zmm6,%%zmm7,%%zmm7 \n"
|
||||
"vmovups %4,%%zmm6 \n"
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovups (%0),%%zmm0 \n"
|
||||
"vmovups 0x40(%0),%%zmm1 \n"
|
||||
"vmovups 0x80(%0),%%zmm2 \n"
|
||||
"vmovups 0xc0(%0),%%zmm3 \n"
|
||||
"vpsubb %%zmm5,%%zmm0,%%zmm0 \n"
|
||||
"vpsubb %%zmm5,%%zmm1,%%zmm1 \n"
|
||||
"vpsubb %%zmm5,%%zmm2,%%zmm2 \n"
|
||||
"vpsubb %%zmm5,%%zmm3,%%zmm3 \n"
|
||||
"vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n"
|
||||
"vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n"
|
||||
"vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n"
|
||||
"vpmaddubsw %%zmm3,%%zmm4,%%zmm3 \n"
|
||||
"lea 0x100(%0),%0 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n"
|
||||
"vpackssdw %%zmm1,%%zmm0,%%zmm0 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n"
|
||||
"vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n"
|
||||
"vpackssdw %%zmm3,%%zmm2,%%zmm2 \n"
|
||||
"vpaddw %%zmm7,%%zmm0,%%zmm0 \n"
|
||||
"vpaddw %%zmm7,%%zmm2,%%zmm2 \n"
|
||||
"vpsrlw $0x8,%%zmm0,%%zmm0 \n"
|
||||
"vpsrlw $0x8,%%zmm2,%%zmm2 \n"
|
||||
"vpackuswb %%zmm2,%%zmm0,%%zmm0 \n"
|
||||
"vpermd %%zmm0,%%zmm6,%%zmm0 \n"
|
||||
"vmovups %%zmm0,(%1) \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x40,%2 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c), // %3
|
||||
"m"(kPermdARGBToY_AVX512BW) // %4
|
||||
: "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6",
|
||||
"zmm7", "zmm16");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOUV444ROW_SSSE3
|
||||
|
||||
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
@ -1724,8 +1700,6 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
#endif // HAS_ARGBTOUV444ROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBTOUV444ROW_AVX512BW
|
||||
static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13,
|
||||
2, 6, 10, 14, 3, 7, 11, 15};
|
||||
|
||||
void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -1977,6 +1951,62 @@ void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_SSSE3
|
||||
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
// RGBA to Y: thin wrapper over the shared matrix row kernel, selecting the
// RGBA channel-order I601 (limited range) coefficient set.
void RGBAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kRgbaI601Constants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
// BGRA to Y: thin wrapper over the shared matrix row kernel, selecting the
// BGRA channel-order I601 (limited range) coefficient set.
void BGRAToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX2(src_argb, dst_y, width, &kBgraI601Constants);
}
#endif
|
||||
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// ARGB to Y: thin wrapper over the AVX512BW matrix row kernel, selecting the
// ARGB channel-order I601 (limited range) coefficient set.
void ARGBToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbI601Constants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// ARGB to YJ: thin wrapper over the AVX512BW matrix row kernel, selecting the
// ARGB channel-order JPEG (full range) coefficient set.
void ARGBToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kArgbJPEGConstants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// ABGR to Y: thin wrapper over the AVX512BW matrix row kernel, selecting the
// ABGR channel-order I601 (limited range) coefficient set.
void ABGRToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrI601Constants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// ABGR to YJ: thin wrapper over the AVX512BW matrix row kernel, selecting the
// ABGR channel-order JPEG (full range) coefficient set.
void ABGRToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kAbgrJPEGConstants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// RGBA to Y: thin wrapper over the AVX512BW matrix row kernel, selecting the
// RGBA channel-order I601 (limited range) coefficient set.
void RGBAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaI601Constants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// RGBA to YJ: thin wrapper over the AVX512BW matrix row kernel, selecting the
// RGBA channel-order JPEG (full range) coefficient set.
void RGBAToYJRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kRgbaJPEGConstants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
// BGRA to Y: thin wrapper over the AVX512BW matrix row kernel, selecting the
// BGRA channel-order I601 (limited range) coefficient set.
void BGRAToYRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_AVX512BW(src_argb, dst_y, width, &kBgraI601Constants);
}
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOUV444ROW_AVX2
|
||||
void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -2127,58 +2157,16 @@ void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
|
||||
}
|
||||
#endif // HAS_ABGRTOUVJROW_AVX2
|
||||
|
||||
void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7)
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kBGRAToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void BGRAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kBgraI601Constants);
|
||||
}
|
||||
|
||||
void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7)
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kABGRToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void ABGRToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN "" //
|
||||
RGBTOY(xmm7)
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kRGBAToY), // %3
|
||||
"m"(kSub128), // %4
|
||||
"m"(kAddY16) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
void RGBAToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_SSSE3(src_argb, dst_y, width, &kRgbaI601Constants);
|
||||
}
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user