YUY2ToARGB use ymm6/7 for shuffle constants

- 1 load and 2 register shuffles replace 2 loads and 2 shuffles with memory operands (see the intrinsics sketch after the benchmark numbers)
- a vbroadcastf128 of a single 16-byte shuffle mask replaces the 32-byte shufflers
- bump version and apply clang-format

libyuv_test '--gunit_filter=*.???2ToARGB_Opt' --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

AMD Zen2
I422ToARGB_Opt (272 ms)
NV12ToARGB_Opt (255 ms)
YUY2ToARGB_Opt (208 ms)

Was:
YUY2ToARGB_Opt (214 ms)
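
To illustrate the first two points, here is a minimal AVX2 intrinsics sketch of the new READYUY2_AVX2 structure (the committed code is GCC inline asm; this standalone helper and its output buffers are hypothetical). The two 16-byte masks are broadcast into registers once, ahead of the loop, so each 32-byte step is one load plus two register shuffles; since vpshufb shuffles each 128-bit lane independently, a broadcast 16-byte mask does the work of the old 32-byte lvec8 tables:

#include <immintrin.h>
#include <stdint.h>

// 16-byte masks from this commit.
static const int8_t kShuffleYUY2Y[16] = {0, 0, 2, 2, 4, 4, 6, 6,
                                         8, 8, 10, 10, 12, 12, 14, 14};
static const int8_t kShuffleYUY2UV[16] = {1, 3, 1, 3, 5, 7, 5, 7,
                                          9, 11, 9, 11, 13, 15, 13, 15};

// Hypothetical helper: produces the doubled-Y and upsampled-UV vectors that
// READYUY2_AVX2 feeds into the YUV-to-RGB math.
void Yuy2ReadSketch(const uint8_t* yuy2, uint8_t* y2, uint8_t* uv, int blocks) {
  // One 16-byte mask broadcast to both lanes (the asm uses vbroadcastf128).
  const __m256i shuf_y = _mm256_broadcastsi128_si256(
      _mm_loadu_si128((const __m128i*)kShuffleYUY2Y));
  const __m256i shuf_uv = _mm256_broadcastsi128_si256(
      _mm_loadu_si128((const __m128i*)kShuffleYUY2UV));
  for (int i = 0; i < blocks; ++i) {  // one block = 32 YUY2 bytes = 16 pixels
    __m256i v = _mm256_loadu_si256((const __m256i*)(yuy2 + 32 * i));  // 1 load
    // 2 register shuffles replace 2 loads plus 2 memory-operand shuffles.
    _mm256_storeu_si256((__m256i*)(y2 + 32 * i),
                        _mm256_shuffle_epi8(v, shuf_y));   // 16 Y, each doubled
    _mm256_storeu_si256((__m256i*)(uv + 32 * i),
                        _mm256_shuffle_epi8(v, shuf_uv));  // 8 UV -> 16 UV
  }
}

In the committed kernels the shuffled vectors feed YUVTORGB directly; the stores above only make the sketch self-contained.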

Change-Id: I1fa4d462d04536c877d1cab1a14586be8ed1b2f2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5218447
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2024-01-21 08:52:02 -08:00 committed by libyuv LUCI CQ
parent 914624f0b8
commit 3e435fe6d4
3 changed files with 315 additions and 389 deletions

README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1885
+Version: 1886
 License: BSD
 License File: LICENSE
 Shipped: yes

include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1885
+#define LIBYUV_VERSION 1886
 #endif  // INCLUDE_LIBYUV_VERSION_H_

source/row_gcc.cc

@@ -137,24 +137,20 @@ static const uvec8 kShuffleMaskARGBToRGB24_0 = {
     0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
-                                    10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
-                                    6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
+static const vec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6,
+                                   8, 8, 10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
-                                     11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
-                                     5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
+static const vec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7,
+                                    9, 11, 9, 11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
-                                    11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
-                                    7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
+static const vec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7,
+                                   9, 9, 11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
-                                     10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
-                                     4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
+static const vec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6,
+                                    8, 10, 8, 10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
@@ -479,9 +475,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 }
 
 void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
       LABELALIGN
       "1: \n"
@@ -514,13 +508,12 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
         "+r"(dst),    // %1
         "+r"(width)   // %2
       : "m"(kShuffleMaskARGBToRGB24)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm6 \n"
+  asm volatile("movdqa %3,%%xmm6 \n"
 
       LABELALIGN
      "1: \n"
@@ -553,7 +546,8 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
         "+r"(dst),    // %1
         "+r"(width)   // %2
       : "m"(kShuffleMaskARGBToRAW)  // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 #ifdef HAS_ARGBTORGB24ROW_AVX2
@@ -1096,9 +1090,7 @@ static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11,
 void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
                          uint16_t* dst_ar64,
                          int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqa %%xmm0,%%xmm1 \n"
@@ -1113,15 +1105,14 @@ void ARGBToAR64Row_SSSE3(const uint8_t* src_argb,
       : "+r"(src_argb),  // %0
         "+r"(dst_ar64),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
                          uint16_t* dst_ab64,
                          int width) {
   asm volatile(
-
       "movdqa %3,%%xmm2 \n"
       "movdqa %4,%%xmm3 \n" LABELALIGN
       "1: \n"
@@ -1146,9 +1137,7 @@ void ARGBToAB64Row_SSSE3(const uint8_t* src_argb,
 void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -1163,16 +1152,16 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64,
       : "+r"(src_ar64),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
                          uint8_t* dst_argb,
                          int width) {
-  asm volatile(
-
-      "movdqa %3,%%xmm2 \n" LABELALIGN
+  asm volatile("movdqa %3,%%xmm2 \n"
+
+      LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -1196,9 +1185,7 @@ void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64,
 void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
                         uint16_t* dst_ar64,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vpermq $0xd8,%%ymm0,%%ymm0 \n"
@@ -1214,8 +1201,8 @@ void ARGBToAR64Row_AVX2(const uint8_t* src_argb,
       : "+r"(src_argb),  // %0
         "+r"(dst_ar64),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 #endif
@@ -1224,7 +1211,6 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
                         uint16_t* dst_ab64,
                         int width) {
   asm volatile(
-
       "vbroadcastf128 %3,%%ymm2 \n"
       "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN
       "1: \n"
@@ -1252,9 +1238,7 @@ void ARGBToAB64Row_AVX2(const uint8_t* src_argb,
 void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -1271,8 +1255,8 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
       : "+r"(src_ar64),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      :
-      : "memory", "cc", "xmm0", "xmm1");
+      ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 #endif
@@ -1280,9 +1264,7 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64,
 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64,
                         uint8_t* dst_argb,
                         int width) {
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
+  asm volatile("vbroadcastf128 %3,%%ymm2 \n" LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -2467,21 +2449,25 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpcklbw %%xmm4,%%xmm4 \n" \
   "lea 0x8(%[y_buf]),%[y_buf] \n"
 
-// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
+// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
+// xmm6 kShuffleYUY2Y,
+// xmm7 kShuffleYUY2UV
 #define READYUY2 \
   "movdqu (%[yuy2_buf]),%%xmm4 \n" \
-  "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \
-  "movdqu (%[yuy2_buf]),%%xmm3 \n" \
-  "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \
-  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n"
+  "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" \
+  "movdqa %%xmm4,%%xmm3 \n" \
+  "pshufb %%xmm6,%%xmm4 \n" \
+  "pshufb %%xmm7,%%xmm3 \n"
 
-// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
+// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
+// xmm6 kShuffleUYVYY,
+// xmm7 kShuffleUYVYUV
 #define READUYVY \
   "movdqu (%[uyvy_buf]),%%xmm4 \n" \
-  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
-  "movdqu (%[uyvy_buf]),%%xmm3 \n" \
-  "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
-  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"
+  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" \
+  "movdqa %%xmm4,%%xmm3 \n" \
+  "pshufb %%xmm6,%%xmm4 \n" \
+  "pshufb %%xmm7,%%xmm3 \n"
 
 // Read 4 UV from P210, upsample to 8 UV
 #define READP210 \
@@ -3200,6 +3186,8 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                 int width) {
   // clang-format off
   asm volatile (
+    "movdqa %[kShuffleYUY2Y],%%xmm6 \n"
+    "movdqa %[kShuffleYUY2UV],%%xmm7 \n"
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
@@ -3217,7 +3205,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   : "memory", "cc", YUVTORGB_REGS
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -3228,6 +3216,8 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                 int width) {
   // clang-format off
   asm volatile (
+    "movdqa %[kShuffleUYVYY],%%xmm6 \n"
+    "movdqa %[kShuffleUYVYUV],%%xmm7 \n"
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb %%xmm5,%%xmm5 \n"
@@ -3598,19 +3588,21 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
+// ymm6 kShuffleYUY2Y,
+// ymm7 kShuffleYUY2UV
 #define READYUY2_AVX2 \
-  "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \
-  "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
-  "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \
-  "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \
+  "vmovdqu (%[yuy2_buf]),%%ymm1 \n" \
+  "vpshufb %%ymm6,%%ymm1,%%ymm4 \n" \
+  "vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \
   "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
+// ymm6 kShuffleUYVYY,
+// ymm7 kShuffleUYVYUV
 #define READUYVY_AVX2 \
-  "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
-  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
-  "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
-  "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
+  "vmovdqu (%[uyvy_buf]),%%ymm1 \n" \
+  "vpshufb %%ymm6,%%ymm1,%%ymm4 \n" \
+  "vpshufb %%ymm7,%%ymm1,%%ymm3 \n" \
   "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"
 
 // TODO(fbarchard): Remove broadcastb
@@ -4414,6 +4406,8 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
                                int width) {
   // clang-format off
   asm volatile (
+    "vbroadcastf128 %[kShuffleYUY2Y],%%ymm6 \n"
+    "vbroadcastf128 %[kShuffleYUY2UV],%%ymm7 \n"
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
@@ -4432,7 +4426,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -4447,6 +4441,8 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
                                int width) {
   // clang-format off
   asm volatile (
+    "vbroadcastf128 %[kShuffleUYVYY],%%ymm6 \n"
+    "vbroadcastf128 %[kShuffleUYVYUV],%%ymm7 \n"
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
@@ -4465,7 +4461,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
   : "memory", "cc", YUVTORGB_REGS_AVX2
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
   );
   // clang-format on
 }
@@ -4705,9 +4701,7 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4728,9 +4722,7 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
 #ifdef HAS_MIRRORROW_AVX2
 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4757,9 +4749,7 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
 void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "movdqa %3,%%xmm5 \n"
+  asm volatile("movdqa %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4780,9 +4770,7 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
 #ifdef HAS_MIRRORUVROW_AVX2
 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -4886,9 +4874,7 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24,
 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "lea -0x10(%0,%2,4),%0 \n"
+  asm volatile("lea -0x10(%0,%2,4),%0 \n"
 
       LABELALIGN
       "1: \n"
@@ -4912,9 +4898,7 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
 void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile(
-
-      "vmovdqu %3,%%ymm5 \n"
+  asm volatile("vmovdqu %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -5563,9 +5547,7 @@ void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
                        uint8_t* dst_g,
                        uint8_t* dst_b,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -5639,9 +5621,7 @@ void MergeRGBRow_SSSE3(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_rgb,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu (%1),%%xmm1 \n"
@@ -5697,7 +5677,6 @@ void MergeARGBRow_SSE2(const uint8_t* src_r,
                        uint8_t* dst_argb,
                        int width) {
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -5738,9 +5717,7 @@ void MergeXRGBRow_SSE2(const uint8_t* src_r,
                        const uint8_t* src_b,
                        uint8_t* dst_argb,
                        int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movq (%2),%%xmm0 \n"  // B
@@ -5779,7 +5756,6 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
                        uint8_t* dst_argb,
                        int width) {
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -5830,7 +5806,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
       "1: \n"
       "vmovdqu (%2),%%xmm0 \n"  // B
-      "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n"  // A(255)
+      "vpcmpeqb %%ymm1,%%ymm1,%%ymm1 \n"  // A(255)
       "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n"  // R
       "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n"  // G
       "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n"
@@ -5856,8 +5832,8 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
         "+r"(src_b),     // %2
         "+r"(dst_argb),  // %3
         "+rm"(width)     // %4
-      :
-      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+      ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2");
 }
 
 #endif  // HAS_MERGEARGBROW_AVX2
@@ -5869,7 +5845,6 @@ void SplitARGBRow_SSE2(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "sub %1,%2 \n"
       "sub %1,%3 \n"
       "sub %1,%4 \n"
@@ -5921,7 +5896,6 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       LABELALIGN
       "1: \n"
@@ -5972,7 +5946,6 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "movdqa %6,%%xmm3 \n"
       "sub %1,%2 \n"
       "sub %1,%3 \n"
@@ -6019,7 +5992,6 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       "movdqa %5,%%xmm3 \n"
 
       LABELALIGN
@@ -6061,7 +6033,6 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
                        uint8_t* dst_a,
                        int width) {
   asm volatile(
-
       "sub %1,%2 \n"
      "sub %1,%3 \n"
       "sub %1,%4 \n"
@@ -6113,7 +6084,6 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                        uint8_t* dst_b,
                        int width) {
   asm volatile(
-
       "vmovdqa %6,%%ymm3 \n"
       "vbroadcastf128 %5,%%ymm4 \n"
@@ -6161,7 +6131,6 @@ void MergeXR30Row_AVX2(const uint16_t* src_r,
                        int width) {
   int shift = depth - 10;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
@@ -6228,7 +6197,6 @@ void MergeAR64Row_AVX2(const uint16_t* src_r,
   int mask = (1 << depth) - 1;
   mask = (mask << 16) + mask;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -6300,7 +6268,6 @@ void MergeXR64Row_AVX2(const uint16_t* src_r,
   int mask = (1 << depth) - 1;
   mask = (mask << 16) + mask;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vmovdqa %7,%%ymm5 \n"
@@ -6364,7 +6331,6 @@ void MergeARGB16To8Row_AVX2(const uint16_t* src_r,
                             int width) {
   int shift = depth - 8;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "sub %0,%3 \n"
@@ -6421,7 +6387,6 @@ void MergeXRGB16To8Row_AVX2(const uint16_t* src_r,
                             int width) {
   int shift = depth - 8;
   asm volatile(
-
       "sub %0,%1 \n"
       "sub %0,%2 \n"
       "vbroadcastf128 %6,%%ymm5 \n"
@@ -6505,9 +6470,7 @@ void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
 
 #ifdef HAS_COPYROW_AVX
 void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -6530,9 +6493,7 @@ void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
 // Multiple of 1.
 void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep movsb \n"
+  asm volatile("rep movsb \n"
       : "+S"(src),       // %0
         "+D"(dst),       // %1
         "+c"(width_tmp)  // %2
@@ -6609,9 +6570,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                               uint8_t* dst_a,
                               int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0), %%xmm0 \n"
       "movdqu 0x10(%0), %%xmm1 \n"
@@ -6744,9 +6703,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
   const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile(
-
-      "rep stosl \n"
+  asm volatile("rep stosl \n"
       : "+D"(dst),       // %0
         "+c"(width_tmp)  // %1
       : "a"(v32)         // %2
@@ -6755,9 +6712,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
 void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep stosb \n"
+  asm volatile("rep stosb \n"
       : "+D"(dst),       // %0
         "+c"(width_tmp)  // %1
       : "a"(v8)          // %2
@@ -6766,9 +6721,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
 void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile(
-
-      "rep stosl \n"
+  asm volatile("rep stosl \n"
       : "+D"(dst_argb),  // %0
         "+c"(width_tmp)  // %1
       : "a"(v32)         // %2
@@ -6904,9 +6857,7 @@ void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
 }
 
 void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x10(%0),%%xmm1 \n"
@@ -7032,9 +6983,7 @@ void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                         int stride_yuy2,
                         uint8_t* dst_uv,
                         int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -7137,9 +7086,7 @@ void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
 }
 
 void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-  asm volatile(
-
-      LABELALIGN
+  asm volatile(LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm0 \n"
       "vmovdqu 0x20(%0),%%ymm1 \n"
@@ -7935,9 +7882,7 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
-
-      "pxor %%xmm5,%%xmm5 \n"
+  asm volatile("pxor %%xmm5,%%xmm5 \n"
 
       // 4 pixel loop.
       LABELALIGN
@@ -7974,9 +7919,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
                           const uint8_t* src_argb1,
                           uint8_t* dst_argb,
                           int width) {
-  asm volatile(
-
-      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+  asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
       // 4 pixel loop.
       LABELALIGN
@@ -8823,9 +8766,7 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           const uint8_t* shuffler,
                           int width) {
-  asm volatile(
-
-      "movdqu (%3),%%xmm5 \n"
+  asm volatile("movdqu (%3),%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -8853,9 +8794,7 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const uint8_t* shuffler,
                          int width) {
-  asm volatile(
-
-      "vbroadcastf128 (%3),%%ymm5 \n"
+  asm volatile("vbroadcastf128 (%3),%%ymm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -8884,9 +8823,7 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8920,9 +8857,7 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8956,9 +8891,7 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_yuy2,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -8995,9 +8928,7 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                         const uint8_t* src_v,
                         uint8_t* dst_uyvy,
                         int width) {
-  asm volatile(
-
-      "sub %1,%2 \n"
+  asm volatile("sub %1,%2 \n"
 
       LABELALIGN
       "1: \n"
@@ -9033,9 +8964,7 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const float* poly,
                             int width) {
-  asm volatile(
-
-      "pxor %%xmm3,%%xmm3 \n"
+  asm volatile("pxor %%xmm3,%%xmm3 \n"
 
       // 2 pixel loop.
       LABELALIGN
@@ -9080,7 +9009,8 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
       : "r"(poly)        // %3
-      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
@@ -9572,9 +9502,7 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
 
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile(
-
-      "movdqu %3,%%xmm5 \n"
+  asm volatile("movdqu %3,%%xmm5 \n"
 
       LABELALIGN
       "1: \n"
@@ -9598,9 +9526,7 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 
 #ifdef HAS_SWAPUVROW_AVX2
 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-  asm volatile(
-
-      "vbroadcastf128 %3,%%ymm5 \n"
+  asm volatile("vbroadcastf128 %3,%%ymm5 \n"
 
       LABELALIGN
       "1: \n"