Transpose 4x4 for SSE2 and AVX2

Skylake Xeon
AVX2 Transpose4x4_Opt (290 ms)
SSE2 Transpose4x4_Opt (302 ms)
C    Transpose4x4_Opt (522 ms)

AMD Zen2
AVX2 Transpose4x4_Opt (136 ms)
SSE2 Transpose4x4_Opt (137 ms)
C    Transpose4x4_Opt (431 ms)

Bug: None
Change-Id: I4997dbd5c5387c22bfd6c5960b421504e4bc8a2a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4292946
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Authored by Frank Barchard on 2023-02-27 01:23:59 -08:00; committed by libyuv LUCI CQ
parent e66f436560
commit f9b23b9cc0
5 changed files with 216 additions and 20 deletions

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1861
+Version: 1862
License: BSD
License File: LICENSE

include/libyuv/rotate_row.h

@@ -42,6 +42,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSE4X4_32_SSE2
#define HAS_TRANSPOSE4X4_32_AVX2
#endif
// The following are available for 64 bit GCC:
@@ -56,6 +58,11 @@ extern "C" {
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE4X4_32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_TRANSPOSEWX16_MSA
#define HAS_TRANSPOSEUVWX16_MSA
@@ -240,19 +247,24 @@ void Transpose4x4_32_NEON(const uint8_t* src,
int dst_stride,
int width);
void Transpose4x4_32_SSE2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
void Transpose4x4_32_AVX2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
void Transpose4x4_32_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
// Transpose 32 bit values (ARGB)
void Transpose8x8_32_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1861
+#define LIBYUV_VERSION 1862
#endif // INCLUDE_LIBYUV_VERSION_H_

source/rotate_gcc.cc

@@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
"xmm7", "xmm8", "xmm9");
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#if defined(HAS_TRANSPOSE4X4_32_SSE2)
// 4 values, little endian view
// a b c d
// e f g h
// i j k l
// m n o p
// transpose 2x2
// a e b f from row 0, 1
// i m j n from row 2, 3
// c g d h from row 0, 1
// k o l p from row 2, 3
// transpose 4x4
// a e i m from row 0, 1
// b f j n from row 0, 1
// c g k o from row 2, 3
// d h l p from row 2, 3
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_SSE2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
"movdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"movdqu (%0),%%xmm2 \n" // i j k l
"movdqu (%0,%3),%%xmm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
// Transpose 2x2
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm2,%%xmm5 \n"
"movdqa %%xmm0,%%xmm6 \n"
"movdqa %%xmm2,%%xmm7 \n"
"punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1
"punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3
"punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1
"punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3
// Transpose 4x4
"movdqa %%xmm4,%%xmm0 \n"
"movdqa %%xmm4,%%xmm1 \n"
"movdqa %%xmm6,%%xmm2 \n"
"movdqa %%xmm6,%%xmm3 \n"
"punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1
"punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1
"punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3
"punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3
"movdqu %%xmm0,(%1) \n"
"lea 16(%1,%4),%1 \n" // dst += stride + 16
"movdqu %%xmm1,-16(%1) \n"
"movdqu %%xmm2,-16(%1,%4) \n"
"movdqu %%xmm3,-16(%1,%4,2) \n"
"sub %4,%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+rm"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSE4X4_32_SSE2)
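For reference, the unpack sequence in the block above maps one-to-one onto SSE2 intrinsics. The following is a minimal single-tile sketch, not part of this change: the function name Transpose4x4_32_SSE2_intrin is hypothetical, and the width loop and pointer walking of the real routine are omitted.

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Hypothetical single-tile equivalent of the asm loop body above:
// reads one 4x4 block of 32-bit values and writes it transposed.
static void Transpose4x4_32_SSE2_intrin(const uint8_t* src, int src_stride,
                                        uint8_t* dst, int dst_stride) {
  __m128i r0 = _mm_loadu_si128((const __m128i*)(src + 0 * src_stride));  // a b c d
  __m128i r1 = _mm_loadu_si128((const __m128i*)(src + 1 * src_stride));  // e f g h
  __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
  __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p

  // Transpose 2x2: interleave 32-bit lanes (punpckldq / punpckhdq).
  __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
  __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // i m j n
  __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // c g d h
  __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p

  // Transpose 4x4: interleave 64-bit lanes (punpcklqdq / punpckhqdq).
  __m128i o0 = _mm_unpacklo_epi64(t0, t1);  // a e i m
  __m128i o1 = _mm_unpackhi_epi64(t0, t1);  // b f j n
  __m128i o2 = _mm_unpacklo_epi64(t2, t3);  // c g k o
  __m128i o3 = _mm_unpackhi_epi64(t2, t3);  // d h l p

  _mm_storeu_si128((__m128i*)(dst + 0 * dst_stride), o0);
  _mm_storeu_si128((__m128i*)(dst + 1 * dst_stride), o1);
  _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), o2);
  _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), o3);
}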
#if defined(HAS_TRANSPOSE4X4_32_AVX2)
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_AVX2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d
"vmovdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vmovdqu (%0),%%xmm2 \n" // i j k l
"vmovdqu (%0,%3),%%xmm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d
"vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l
"vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
// Transpose 2x2
"vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1
"vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3
"vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1
"vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3
// Transpose 4x4
"vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1
"vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1
"vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3
"vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3
"vmovdqu %%ymm0,(%1) \n"
"lea 32(%1,%4),%1 \n" // dst += stride + 32
"vmovdqu %%ymm1,-32(%1) \n"
"vmovdqu %%ymm2,-32(%1,%4) \n"
"vmovdqu %%ymm3,-32(%1,%4,2) \n"
"sub %4,%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+rm"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSE4X4_32_AVX2)
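The AVX2 routine above keeps the same unpack pattern but carries two vertically adjacent 4x4 tiles per iteration: the second tile's rows are placed in the upper 128-bit lane with vinserti128, and because the vpunpck* instructions operate independently per lane, each 32-byte store writes eight transposed 32-bit values per destination row (columns 0-3 from the first tile, columns 4-7 from the second), which is why the counter steps by 8. A minimal single-iteration intrinsics sketch follows; the name Transpose4x4x2_32_AVX2_intrin is hypothetical and the outer loop is omitted.

#include <immintrin.h>  // AVX2
#include <stdint.h>

// Hypothetical single-iteration equivalent of the AVX2 loop body above:
// transposes two vertically adjacent 4x4 blocks of 32-bit values, writing
// eight transposed values per destination row.
static void Transpose4x4x2_32_AVX2_intrin(const uint8_t* src, int src_stride,
                                          uint8_t* dst, int dst_stride) {
  // Rows 0..3 (first tile) in the low lanes, rows 4..7 (second tile) in the
  // high lanes, mirroring the vmovdqu + vinserti128 loads.
  __m256i r0 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 0 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 4 * src_stride)), 1);
  __m256i r1 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 1 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 5 * src_stride)), 1);
  __m256i r2 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 2 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 6 * src_stride)), 1);
  __m256i r3 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 3 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 7 * src_stride)), 1);

  // Per-lane 2x2 then 4x4 transpose, exactly as in the SSE2 path.
  __m256i t0 = _mm256_unpacklo_epi32(r0, r1);
  __m256i t1 = _mm256_unpacklo_epi32(r2, r3);
  __m256i t2 = _mm256_unpackhi_epi32(r0, r1);
  __m256i t3 = _mm256_unpackhi_epi32(r2, r3);
  __m256i o0 = _mm256_unpacklo_epi64(t0, t1);
  __m256i o1 = _mm256_unpackhi_epi64(t0, t1);
  __m256i o2 = _mm256_unpacklo_epi64(t2, t3);
  __m256i o3 = _mm256_unpackhi_epi64(t2, t3);

  // Each store covers dst columns 0..7 of one transposed row.
  _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), o0);
  _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), o1);
  _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), o2);
  _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), o3);
}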
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

unit_test/rotate_test.cc

@@ -864,7 +864,55 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
#if defined(ENABLE_ROW_TESTS)
-TEST_F(LibYUVRotateTest, Transpose4x4) {
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
// dst width and height
const int width = 4;
const int height = 4;
int src_pixels[4][4];
int dst_pixels_c[4][4];
int dst_pixels_opt[4][4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
src_pixels[i][j] = i * 10 + j;
}
}
memset(dst_pixels_c, 1, width * height * 4);
memset(dst_pixels_opt, 2, width * height * 4);
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_c, width * 4, width);
const int benchmark_iterations =
(benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
(4 * 4);
for (int i = 0; i < benchmark_iterations; ++i) {
#if defined(HAS_TRANSPOSE4X4_32_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
}
}
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
}
}
}
TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
// dst width and height
const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
const int height = 4;
@@ -874,29 +922,35 @@ TEST_F(LibYUVRotateTest, Transpose4x4)
MemRandomize(src_pixels, height * width * 4);
memset(dst_pixels_c, 1, width * height * 4);
-memset(dst_pixels_opt, 1, width * height * 4);
+memset(dst_pixels_opt, 2, width * height * 4);
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_c, width * 4, width);
for (int i = 0; i < benchmark_iterations_; ++i) {
-#if defined(__aarch64__)
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
-} else {
+} else
#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
}
-#else
-Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
-(uint8_t*)dst_pixels_opt, width * 4, width);
-#endif
}
-// for (int i = 0; i < width * height; ++i) {
-//   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-// }
+for (int i = 0; i < width * height; ++i) {
+EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_c);