mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
add RGBToYMatrixRow_AVX2
Adds RGBToYMatrixRow_AVX2, which reads 24-bit RGB values by loading 3 vectors instead of 4 and permutes them into 4 ARGB vectors before conversion.

Also adds RGBToYMatrixRow_Opt and RGBToYMatrixRow_2Step_Opt to convert_argb_test.cc to benchmark and compare the direct AVX2 conversion against a 2-step approach.

./libyuv_test '--gunit_filter=*RAWToJ400_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=10000 --libyuv_flags=-1 --libyuv_cpu_info=-1

AMD Zen 5
Was LibYUVConvertTest.RAWToJ400_Opt (757 ms)
Now LibYUVConvertTest.RAWToJ400_Opt (699 ms)

Intel Skylake
Was LibYUVConvertTest.RAWToJ400_Opt (1705 ms)
Now LibYUVConvertTest.RAWToJ400_Opt (1426 ms)

Bug: 477295731
Change-Id: I29866baf4ad5fe7a3725e4a01f2fe24649510a7d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7777325
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Justin Green <greenjustin@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
ddc6764d13
commit
9f13b2814d
@ -280,8 +280,11 @@ extern "C" {
|
||||
#define HAS_ABGRTOAR30ROW_AVX2
|
||||
#define HAS_ABGRTOUVJROW_AVX2
|
||||
#define HAS_ABGRTOUVROW_AVX2
|
||||
#if defined(__x86_64__)
|
||||
#define HAS_ABGRTOYJROW_AVX2
|
||||
#define HAS_ABGRTOYROW_AVX2
|
||||
#define HAS_ARGBTOYROW_AVX2
|
||||
#endif
|
||||
#define HAS_AR64TOARGBROW_AVX2
|
||||
#define HAS_ARGBATTENUATEROW_AVX2
|
||||
#define HAS_ARGBTOAB64ROW_AVX2
|
||||
@ -295,8 +298,9 @@ extern "C" {
|
||||
#define HAS_ARGBTOUVROW_AVX2
|
||||
#define HAS_ARGBTOUVMATRIXROW_AVX2
|
||||
#define HAS_ARGBTOUV444MATRIXROW_AVX2
|
||||
#define HAS_ARGBTOYJROW_AVX2
|
||||
#define HAS_ARGBTOYROW_AVX2
|
||||
#if defined(__x86_64__)
|
||||
#define HAS_RGBTOYMATRIXROW_AVX2
|
||||
#endif
|
||||
#define HAS_ARGBUNATTENUATEROW_AVX2
|
||||
#define HAS_CONVERT16TO8ROW_AVX2
|
||||
#define HAS_CONVERT8TO16ROW_AVX2
|
||||
@ -330,9 +334,11 @@ extern "C" {
|
||||
#define HAS_P210TOARGBROW_AVX2
|
||||
#define HAS_P410TOAR30ROW_AVX2
|
||||
#define HAS_P410TOARGBROW_AVX2
|
||||
#if defined(__x86_64__)
|
||||
#define HAS_RAWTOYJROW_AVX2
|
||||
#define HAS_RGB24TOYJROW_AVX2
|
||||
#define HAS_RGBATOYJROW_AVX2
|
||||
#endif
|
||||
#define HAS_SPLITARGBROW_AVX2
|
||||
#define HAS_SPLITRGBROW_AVX2
|
||||
#define HAS_SPLITUVROW_16_AVX2
|
||||
@ -354,6 +360,7 @@ extern "C" {
|
||||
defined(_M_X64) || defined(_M_X86)) && \
|
||||
((defined(_MSC_VER) && !defined(__clang__)) || \
|
||||
defined(LIBYUV_ENABLE_ROWWIN))
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define HAS_ARGBTOYROW_AVX2
|
||||
#define HAS_ABGRTOYROW_AVX2
|
||||
#define HAS_ARGBTOYJROW_AVX2
|
||||
@ -362,6 +369,7 @@ extern "C" {
|
||||
#define HAS_RGBATOYROW_AVX2
|
||||
#define HAS_BGRATOYROW_AVX2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// The following are available for AVX512 clang x86 platforms:
|
||||
// TODO(fbarchard): Port to GCC and Visual C
|
||||
@ -1857,6 +1865,10 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// NEON row function: source row, destination Y row, pixel count and a
// conversion-constant block. The implementation is not part of this view.
// NOTE(review): the parameter is named src_argb, but the NEON wrappers in
// this change pass RGB24/RAW rows to it -- confirm the expected layout.
void RGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
@ -1866,6 +1878,10 @@ void ARGBToYMatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
// "Any" counterpart of RGBToYMatrixRow_NEON with the same signature.
// NOTE(review): presumably the variant for widths that are not a multiple of
// the kernel's step; its definition is not visible here -- confirm.
void RGBToYMatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#endif
|
||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
@ -2220,6 +2236,12 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#if defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
// AVX2 row function that converts packed 24-bit pixels to Y using the
// coefficients in `c`. Only declared when HAS_RGBTOYMATRIXROW_AVX2 is set.
// NOTE(review): the source parameter is named src_argb although the kernel
// consumes 3 bytes per pixel.
void RGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#endif
|
||||
void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
@ -2232,6 +2254,12 @@ void ARGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#if defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
// "Any" counterpart of RGBToYMatrixRow_AVX2 with the same signature; it is
// generated in row_any by ANY11MC(..., 3, 31). Only declared when
// HAS_RGBTOYMATRIXROW_AVX2 is set.
void RGBToYMatrixRow_Any_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#endif
|
||||
void ARGBToYMatrixRow_Any_AVX512BW(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
|
||||
@ -2925,6 +2925,7 @@ int RGBAToI420(const uint8_t* src_rgba,
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_LSX) || \
|
||||
defined(HAS_RGB24TOYROW_AVX2) || \
|
||||
defined(HAS_RGB24TOYROW_RVV))
|
||||
#define HAS_RGB24TOYROW
|
||||
#endif
|
||||
@ -2969,6 +2970,16 @@ int RGB24ToI420(const uint8_t* src_rgb24,
|
||||
|
||||
#if defined(HAS_RGB24TOYROW)
|
||||
|
||||
#if defined(HAS_RGB24TOYROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// TODO(fbarchard): Write an AVX2 function for RGB24ToUVRow.
|
||||
RGB24ToYRow = RGB24ToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RGB24ToYRow = RGB24ToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Neon version does direct RGB24 to YUV.
|
||||
#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
@ -3112,7 +3123,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
|
||||
#undef HAS_RGB24TOYROW
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_RVV)
|
||||
#if defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_AVX2) || defined(HAS_RGB24TOYJROW_RVV)
|
||||
#define HAS_RGB24TOYJROW
|
||||
#endif
|
||||
|
||||
@ -3156,6 +3167,15 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
||||
|
||||
#if defined(HAS_RGB24TOYJROW)
|
||||
|
||||
#if defined(HAS_RGB24TOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RGB24ToYJRow = RGB24ToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Neon version does direct RGB24 to YUV.
|
||||
#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
@ -3288,6 +3308,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_LSX) || \
|
||||
defined(HAS_RAWTOYROW_AVX2) || \
|
||||
defined(HAS_RAWTOYROW_RVV))
|
||||
#define HAS_RAWTOYROW
|
||||
#endif
|
||||
@ -3331,6 +3352,16 @@ int RAWToI420(const uint8_t* src_raw,
|
||||
|
||||
#if defined(HAS_RAWTOYROW)
|
||||
|
||||
#if defined(HAS_RAWTOYROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// TODO(fbarchard): Write an AVX2 function for RAWToUVRow.
|
||||
RAWToYRow = RAWToYRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RAWToYRow = RAWToYRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Neon version does direct RAW to YUV.
|
||||
#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
@ -3482,7 +3513,7 @@ int RAWToI420(const uint8_t* src_raw,
|
||||
#undef HAS_RAWTOYROW
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_AVX2) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#define HAS_RAWTOYJROW
|
||||
#endif
|
||||
|
||||
@ -3526,6 +3557,15 @@ int RAWToJ420(const uint8_t* src_raw,
|
||||
|
||||
#if defined(HAS_RAWTOYJROW)
|
||||
|
||||
#if defined(HAS_RAWTOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RAWToYJRow = RAWToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RAWToYJRow = RAWToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Neon version does direct RAW to YUV.
|
||||
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
@ -4717,7 +4757,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RGB24TOYJROW_AVX2)
|
||||
#if defined(HAS_RGB24TOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RGB24ToYJRow = RGB24ToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
@ -4798,7 +4838,7 @@ int RAWToJ400(const uint8_t* src_raw,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_RAWTOYJROW_AVX2)
|
||||
#if defined(HAS_RAWTOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
RAWToYJRow = RAWToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
|
||||
@ -3979,7 +3979,7 @@ int ARGBToAB64(const uint8_t* src_argb,
|
||||
}
|
||||
|
||||
// Enabled if 1 pass is available
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#if defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_AVX2) || defined(HAS_RAWTOYJROW_RVV)
|
||||
#define HAS_RAWTOYJROW
|
||||
#endif
|
||||
|
||||
@ -4024,6 +4024,16 @@ int RAWToJNV21(const uint8_t* src_raw,
|
||||
|
||||
#if defined(HAS_RAWTOYJROW)
|
||||
|
||||
#if defined(HAS_RAWTOYJROW_AVX2) && defined(HAS_RGBTOYMATRIXROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
// TODO(fbarchard): Write an AVX2 function for RAWToUVJRow.
|
||||
RAWToYJRow = RAWToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
RAWToYJRow = RAWToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Neon version does direct RAW to YUV.
|
||||
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
|
||||
@ -1202,6 +1202,9 @@ ANY11(RGBAToYRow_Any_LASX, RGBAToYRow_LASX, 0, 4, 1, 31)
|
||||
#ifdef HAS_RGB24TOYROW_NEON
|
||||
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOYROW_AVX2
|
||||
ANY11(RGB24ToYRow_Any_AVX2, RGB24ToYRow_AVX2, 0, 3, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOYJROW_AVX2
|
||||
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
|
||||
#endif
|
||||
@ -1226,6 +1229,9 @@ ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
|
||||
#ifdef HAS_RAWTOYROW_NEON
|
||||
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_RAWTOYROW_AVX2
|
||||
ANY11(RAWToYRow_Any_AVX2, RAWToYRow_AVX2, 0, 3, 1, 31)
|
||||
#endif
|
||||
#ifdef HAS_RAWTOYJROW_AVX2
|
||||
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
|
||||
#endif
|
||||
@ -2315,8 +2321,9 @@ ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
ANY11MC(ARGBToYMatrixRow_Any_SSSE3, ARGBToYMatrixRow_SSSE3, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
#ifdef HAS_ARGBTOYMATRIXROW_AVX2
|
||||
ANY11MC(ARGBToYMatrixRow_Any_AVX2, ARGBToYMatrixRow_AVX2, 4, 31)
|
||||
ANY11MC(RGBToYMatrixRow_Any_AVX2, RGBToYMatrixRow_AVX2, 3, 31)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOYROW_AVX512BW
|
||||
ANY11MC(ARGBToYMatrixRow_Any_AVX512BW, ARGBToYMatrixRow_AVX512BW, 4, 63)
|
||||
|
||||
@ -4377,42 +4377,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_RGB24TOYJROW_AVX2
|
||||
// Converts a row of `width` RGB24 pixels to YJ values in two passes through
// an intermediate ARGB buffer: RGB24ToARGBRow_SSSE3 expands a chunk into
// `row`, then ARGBToYJRow_AVX2 converts that chunk. The row is handled in
// chunks of at most MAXTWIDTH pixels so the scratch buffer
// (MAXTWIDTH * 4 bytes) is never exceeded.
|
||||
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
// Row buffer for intermediate ARGB pixels.
|
||||
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth);
|
||||
ARGBToYJRow_AVX2(row, dst_yj, twidth);
|
||||
src_rgb24 += twidth * 3;
|
||||
dst_yj += twidth;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif // HAS_RGB24TOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_RAWTOYJROW_AVX2
|
||||
// Converts a row of `width` RAW pixels to YJ values in two passes through an
// intermediate ARGB buffer. Each chunk of at most MAXTWIDTH pixels is first
// expanded into `row` (with RAWToARGBRow_AVX2 when HAS_RAWTOARGBROW_AVX2 is
// defined, otherwise RAWToARGBRow_SSSE3) and then converted with
// ARGBToYJRow_AVX2.
|
||||
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||
// Row buffer for intermediate ARGB pixels.
|
||||
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
#ifdef HAS_RAWTOARGBROW_AVX2
|
||||
RAWToARGBRow_AVX2(src_raw, row, twidth);
|
||||
#else
|
||||
RAWToARGBRow_SSSE3(src_raw, row, twidth);
|
||||
#endif
|
||||
ARGBToYJRow_AVX2(row, dst_yj, twidth);
|
||||
src_raw += twidth * 3;
|
||||
dst_yj += twidth;
|
||||
width -= twidth;
|
||||
}
|
||||
}
|
||||
#endif // HAS_RAWTOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_RGB24TOYJROW_SSSE3
|
||||
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
|
||||
void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
|
||||
@ -1347,6 +1347,46 @@ void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
|
||||
"sub $0x20,%2 \n" \
|
||||
"jg 1b \n"
|
||||
|
||||
#if defined(__x86_64__)
|
||||
// Inner loop for converting packed 24-bit pixels to Y, 32 pixels per
// iteration. Operands: %0 = source pointer, %1 = destination pointer,
// %2 = remaining pixel count. Registers the enclosing asm block must set up:
//   ymm4       = coefficients used by vpmaddubsw
//   ymm5       = per-byte bias subtracted from every pixel byte
//   ymm6       = vpermd indices applied to the packed result before the store
//   ymm8, ymm9 = vpermd indices used to gather the pixel dwords
//   ymm10      = vpshufb mask applied after the gather
//   `round`    = register added to the 16-bit sums before the shift by 8
// Each iteration loads 96 bytes (3 x 32) from %0, rearranges them into four
// vectors (ymm0-ymm3), does multiply-add and horizontal add, adds `round`,
// shifts right by 8, packs to bytes and stores 32 bytes at %1. ymm11 is a
// temporary. The count is tested at the bottom (sub/jg), so the body always
// runs at least once.
#define RGB24ToY_AVX2(round) \
|
||||
"1: \n" \
|
||||
"vmovdqu (%0),%%ymm0 \n" \
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n" \
|
||||
"vmovdqu 0x40(%0),%%ymm2 \n" \
|
||||
"vperm2i128 $0x21,%%ymm1,%%ymm0,%%ymm3 \n" \
|
||||
"vperm2i128 $0x21,%%ymm2,%%ymm1,%%ymm11 \n" \
|
||||
"vpermd %%ymm0,%%ymm8,%%ymm0 \n" \
|
||||
"vpermd %%ymm3,%%ymm9,%%ymm1 \n" \
|
||||
"vpermd %%ymm2,%%ymm9,%%ymm3 \n" \
|
||||
"vpermd %%ymm11,%%ymm8,%%ymm2 \n" \
|
||||
"vpshufb %%ymm10,%%ymm0,%%ymm0 \n" \
|
||||
"vpshufb %%ymm10,%%ymm1,%%ymm1 \n" \
|
||||
"vpshufb %%ymm10,%%ymm2,%%ymm2 \n" \
|
||||
"vpshufb %%ymm10,%%ymm3,%%ymm3 \n" \
|
||||
"vpsubb %%ymm5,%%ymm0,%%ymm0 \n" \
|
||||
"vpsubb %%ymm5,%%ymm1,%%ymm1 \n" \
|
||||
"vpsubb %%ymm5,%%ymm2,%%ymm2 \n" \
|
||||
"vpsubb %%ymm5,%%ymm3,%%ymm3 \n" \
|
||||
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
|
||||
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
|
||||
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
|
||||
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
|
||||
"lea 0x60(%0),%0 \n" \
|
||||
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" \
|
||||
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
|
||||
"prefetcht0 1280(%0) \n" \
|
||||
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" \
|
||||
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
|
||||
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
|
||||
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" \
|
||||
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" \
|
||||
"vmovdqu %%ymm0,(%1) \n" \
|
||||
"lea 0x20(%1),%1 \n" \
|
||||
"sub $0x20,%2 \n" \
|
||||
"jg 1b \n"
|
||||
#endif
|
||||
|
||||
// clang-format on
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
@ -1433,10 +1473,60 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
#endif
|
||||
#endif // HAS_RGBATOYJROW_AVX2
|
||||
|
||||
// Forward declaration so the wrappers below can be placed ahead of the
// definition later in this file.
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
// Forward declaration of the 24-bit-input kernel used by the RGB24/RAW
// wrappers that follow; the definition appears later in this file under
// __x86_64__.
void RGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_AVX2
|
||||
// RGB24 -> YJ row: forwards directly to RGBToYMatrixRow_AVX2 with
// kArgbJPEGConstants; no intermediate ARGB buffer is used.
void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_AVX2(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
// RAW -> YJ row: forwards to RGBToYMatrixRow_AVX2 with kAbgrJPEGConstants.
// NOTE(review): presumably the Abgr constants carry the coefficients in the
// swapped channel order that RAW needs -- confirm.
void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_AVX2(src_raw, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// RGB24 -> Y row: forwards to RGBToYMatrixRow_AVX2 with kArgbI601Constants.
void RGB24ToYRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_AVX2(src_rgb24, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
// RAW -> Y row: forwards to RGBToYMatrixRow_AVX2 with kAbgrI601Constants.
// NOTE(review): presumably the Abgr constants carry the coefficients in the
// swapped channel order that RAW needs -- confirm.
void RAWToYRow_AVX2(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_AVX2(src_raw, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_NEON)
|
||||
// RGB24 -> YJ row: forwards to RGBToYMatrixRow_NEON with kArgbJPEGConstants.
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
// RAW -> YJ row: forwards to RGBToYMatrixRow_NEON with kAbgrJPEGConstants.
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// RGB24 -> Y row: forwards to RGBToYMatrixRow_NEON with kArgbI601Constants.
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
// RAW -> Y row: forwards to RGBToYMatrixRow_NEON with kAbgrI601Constants.
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOUV444ROW_AVX2) || \
|
||||
defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
|
||||
// vpermd for vphaddw + vpackuswb vpermd.
|
||||
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
|
||||
#if defined(__x86_64__)
|
||||
// vpermd index tables used by RGBToYMatrixRow_AVX2 (loaded into ymm8/ymm9).
// Each table places three consecutive source dwords (12 bytes) in the low
// three dwords of each 128-bit lane: _0 takes dwords 0-2 and 3-5, _1 takes
// dwords 2-4 and 5-7. The fourth index in each lane is 0 and acts as filler.
// NOTE(review): presumably the filler dword is discarded by the vpshufb that
// follows -- confirm against kShuffleMaskRGB24ToARGB.
static const lvec32 kPermdRGB24_0_AVX2 = {0, 1, 2, 0, 3, 4, 5, 0};
|
||||
static const lvec32 kPermdRGB24_1_AVX2 = {2, 3, 4, 0, 5, 6, 7, 0};
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
@ -1491,6 +1581,40 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
#if defined(__x86_64__)
|
||||
// Converts packed 3-byte pixels to Y using the coefficients in `c`.
// Despite its name, src_argb is consumed as 24-bit data: the loop advances
// the source by 0x60 (96) bytes for every 32 bytes written to dst_y.
//
// Setup before the loop:
//   ymm5  = 0x80 in every byte (all-ones, shifted to 0x8000 words, then
//           packed with signed saturation); the loop subtracts it from each
//           pixel byte.
//   ymm4  = the 16 bytes at c + 0, broadcast to both 128-bit lanes.
//   ymm7  = the 16 bytes at c + 0x60, broadcast, minus the horizontal sum of
//           vpmaddubsw(ymm4, ymm5). This appears to fold the compensation
//           for the 0x80 bias into the rounding term -- TODO confirm against
//           the ArgbConstants layout.
//   ymm6  = kPermdARGBToY_AVX; ymm8 / ymm9 = kPermdRGB24_0_AVX2 /
//           kPermdRGB24_1_AVX2; ymm10 = kShuffleMaskRGB24ToARGB broadcast to
//           both lanes.
//
// The loop (RGB24ToY_AVX2) handles 32 pixels per iteration and tests the
// count at the bottom, so a width that is not a positive multiple of 32 is
// effectively rounded up, reading and writing past the requested count.
// The dispatch code in this change selects this function only when
// IS_ALIGNED(width, 32) and uses the _Any_ variant otherwise.
void RGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vpsllw $15,%%ymm5,%%ymm5 \n"
|
||||
"vpacksswb %%ymm5,%%ymm5,%%ymm5 \n"
|
||||
"vbroadcastf128 0(%3),%%ymm4 \n"
|
||||
"vbroadcastf128 0x60(%3),%%ymm7 \n"
|
||||
"vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n"
|
||||
"vphaddw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsubw %%ymm6,%%ymm7,%%ymm7 \n"
|
||||
"vmovdqa %4,%%ymm6 \n"
|
||||
"vmovdqa %5,%%ymm8 \n"
|
||||
"vmovdqa %6,%%ymm9 \n"
|
||||
"vbroadcastf128 %7,%%ymm10 \n"
|
||||
LABELALIGN ""
|
||||
RGB24ToY_AVX2(ymm7)
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c), // %3
|
||||
"m"(kPermdARGBToY_AVX), // %4
|
||||
"m"(kPermdRGB24_0_AVX2),// %5
|
||||
"m"(kPermdRGB24_1_AVX2),// %6
|
||||
"m"(kShuffleMaskRGB24ToARGB) // %7
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7", "xmm8", "xmm9", "xmm10", "xmm11");
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW)
|
||||
|
||||
@ -1706,6 +1706,78 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
|
||||
}
|
||||
#endif // HAS_ARGBTOAR30ROW_AVX2
|
||||
|
||||
#ifdef HAS_RGBTOYMATRIXROW_AVX2
|
||||
// Benchmarks the direct 24-bit kernel RGBToYMatrixRow_AVX2 and checks its
// output byte-for-byte against a C reference. The pixel count is the
// benchmark frame size rounded up to a multiple of 32. When AVX2 is not
// reported, the timed loop runs ARGBToYMatrixRow_C on the pre-expanded ARGB
// copy instead.
TEST_F(LibYUVConvertTest, RGBToYMatrixRow_Opt) {
|
||||
const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
|
||||
align_buffer_page_end(src, kPixels * 3);
|
||||
align_buffer_page_end(dst_opt, kPixels);
|
||||
align_buffer_page_end(dst_c, kPixels);
|
||||
MemRandomize(src, kPixels * 3);
|
||||
memset(dst_opt, 0, kPixels);
|
||||
memset(dst_c, 1, kPixels);
|
||||
|
||||
// kArgbI601Constants is used because it is a commonly used matrix.
|
||||
// The reference output comes from ARGBToYMatrixRow_C, which expects
|
||||
// 4-byte ARGB pixels rather than the 3-byte pixels read by
|
||||
// RGBToYMatrixRow_AVX2. The RGB24 source is therefore expanded to
|
||||
// ARGB with RGB24ToARGB first, and ARGBToYMatrixRow_C is then run on the
|
||||
// expanded copy with the same constants.
|
||||
align_buffer_page_end(src_argb, kPixels * 4);
|
||||
RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1);
|
||||
ARGBToYMatrixRow_C(src_argb, dst_c, kPixels, &kArgbI601Constants);
|
||||
|
||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
if (has_avx2) {
|
||||
RGBToYMatrixRow_AVX2(src, dst_opt, kPixels, &kArgbI601Constants);
|
||||
} else {
|
||||
ARGBToYMatrixRow_C(src_argb, dst_opt, kPixels, &kArgbI601Constants);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < kPixels; ++i) {
|
||||
EXPECT_EQ(dst_opt[i], dst_c[i]);
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(src_argb);
|
||||
free_aligned_buffer_page_end(src);
|
||||
free_aligned_buffer_page_end(dst_opt);
|
||||
free_aligned_buffer_page_end(dst_c);
|
||||
}
|
||||
|
||||
// Benchmarks the 2-step path for comparison with RGBToYMatrixRow_Opt: every
// iteration expands the RGB24 source to ARGB with RGB24ToARGB and then runs
// ARGBToYMatrixRow_AVX2 (or ARGBToYMatrixRow_C when AVX2 is not reported).
// The result is checked byte-for-byte against a reference computed once with
// RGB24ToARGB followed by ARGBToYMatrixRow_C.
TEST_F(LibYUVConvertTest, RGBToYMatrixRow_2Step_Opt) {
|
||||
const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31;
|
||||
align_buffer_page_end(src, kPixels * 3);
|
||||
align_buffer_page_end(src_argb, kPixels * 4);
|
||||
align_buffer_page_end(dst_opt, kPixels);
|
||||
align_buffer_page_end(dst_c, kPixels);
|
||||
MemRandomize(src, kPixels * 3);
|
||||
memset(dst_opt, 0, kPixels);
|
||||
memset(dst_c, 1, kPixels);
|
||||
|
||||
RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1);
|
||||
ARGBToYMatrixRow_C(src_argb, dst_c, kPixels, &kArgbI601Constants);
|
||||
|
||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||
if (has_avx2) {
|
||||
RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1);
|
||||
ARGBToYMatrixRow_AVX2(src_argb, dst_opt, kPixels, &kArgbI601Constants);
|
||||
} else {
|
||||
RGB24ToARGB(src, kPixels * 3, src_argb, kPixels * 4, kPixels, 1);
|
||||
ARGBToYMatrixRow_C(src_argb, dst_opt, kPixels, &kArgbI601Constants);
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < kPixels; ++i) {
|
||||
EXPECT_EQ(dst_opt[i], dst_c[i]);
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(src_argb);
|
||||
free_aligned_buffer_page_end(src);
|
||||
free_aligned_buffer_page_end(dst_opt);
|
||||
free_aligned_buffer_page_end(dst_c);
|
||||
}
|
||||
#endif // HAS_RGBTOYMATRIXROW_AVX2
|
||||
|
||||
#ifdef HAS_ABGRTOAR30ROW_AVX2
|
||||
TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) {
|
||||
// ABGRToAR30Row_AVX2 expects a multiple of 8 pixels.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user