Transpose 4x4 for SSE2 and AVX2
Skylake Xeon
  AVX2 Transpose4x4_Opt (290 ms)
  SSE2 Transpose4x4_Opt (302 ms)
  C    Transpose4x4_Opt (522 ms)
AMD Zen2
  AVX2 Transpose4x4_Opt (136 ms)
  SSE2 Transpose4x4_Opt (137 ms)
  C    Transpose4x4_Opt (431 ms)

Bug: None
Change-Id: I4997dbd5c5387c22bfd6c5960b421504e4bc8a2a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4292946
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
commit f9b23b9cc0
parent e66f436560
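As context for the diff below: the kernels added by this change transpose 4x4 blocks of 32-bit (ARGB) pixels, reading four source rows at a time and writing them as four destination columns. The following scalar sketch is illustrative only (the helper name is hypothetical, not part of the change); it assumes the same (src, src_stride, dst, dst_stride, width) convention as the existing Transpose4x4_32_C, where width is the destination width, i.e. the number of 4-pixel source rows, processed in steps of 4.

#include <stdint.h>

// Illustrative scalar equivalent of the 4x4 32-bit transpose (hypothetical
// name).  Each step reads a 4x4 block of uint32 pixels from four consecutive
// source rows and writes it transposed, so source row (x + i) becomes
// destination column (x + i).
static void Transpose4x4_32_Sketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride, int width) {
  for (int x = 0; x < width; x += 4) {  // 4 source rows / 4 dst columns per block
    for (int i = 0; i < 4; ++i) {       // row within the block
      for (int j = 0; j < 4; ++j) {     // column within the block
        const uint32_t* s =
            (const uint32_t*)(src + (intptr_t)(x + i) * src_stride) + j;
        uint32_t* d = (uint32_t*)(dst + (intptr_t)j * dst_stride) + (x + i);
        *d = *s;
      }
    }
  }
}

With width == 4 and both strides equal to 16 bytes this reduces to dst[i][j] = src[j][i], which is exactly what the new Transpose4x4_Test unit test below checks.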
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1861
+Version: 1862
 License: BSD
 License File: LICENSE
 
@@ -42,6 +42,8 @@ extern "C" {
 // The following are available for GCC 32 or 64 bit:
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
 #endif
 
 // The following are available for 64 bit GCC:
@@ -56,6 +58,11 @@ extern "C" {
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSE4X4_32_NEON
+#endif
+
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_TRANSPOSEWX16_MSA
 #define HAS_TRANSPOSEUVWX16_MSA
@@ -240,19 +247,24 @@ void Transpose4x4_32_NEON(const uint8_t* src,
                           int dst_stride,
                           int width);
 
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
 void Transpose4x4_32_C(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width);
 
-// Transpose 32 bit values (ARGB)
-void Transpose8x8_32_NEON(const uint8_t* src,
-                          int src_stride,
-                          uint8_t* dst,
-                          int dst_stride,
-                          int width);
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
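The declarations above, together with the new HAS_TRANSPOSE4X4_32_* macros, are what a caller dispatches on. As a rough sketch only (the wrapper below is hypothetical and not part of this change), runtime selection would mirror the #if / TestCpuFlag pattern used in the updated tests further down:

#include "libyuv/cpu_id.h"      // TestCpuFlag, kCpuHas* flags
#include "libyuv/rotate_row.h"  // Transpose4x4_32_* declarations

// Hypothetical wrapper: prefer AVX2, then SSE2, then NEON, else plain C.
static void Transpose4x4_32_Any(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride, int width) {
#if defined(HAS_TRANSPOSE4X4_32_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    Transpose4x4_32_AVX2(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
#if defined(HAS_TRANSPOSE4X4_32_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    Transpose4x4_32_SSE2(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
#if defined(HAS_TRANSPOSE4X4_32_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    Transpose4x4_32_NEON(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
  Transpose4x4_32_C(src, src_stride, dst, dst_stride, width);
}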
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1861
+#define LIBYUV_VERSION 1862
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
         "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_SSE2)
+// 4 values, little endian view
+// a b c d
+// e f g h
+// i j k l
+// m n o p
+
+// transpose 2x2
+// a e b f   from row 0, 1
+// i m j n   from row 2, 3
+// c g d h   from row 0, 1
+// k o l p   from row 2, 3
+
+// transpose 4x4
+// a e i m   from row 0, 1
+// b f j n   from row 0, 1
+// c g k o   from row 2, 3
+// d h l p   from row 2, 3
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 4x4.  Read a column, write a row.
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"  // a b c d
+      "movdqu      (%0,%3),%%xmm1                \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "movdqu      (%0),%%xmm2                   \n"  // i j k l
+      "movdqu      (%0,%3),%%xmm3                \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "movdqa      %%xmm0,%%xmm6                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "punpckldq   %%xmm1,%%xmm4                 \n"  // a e b f from row 0, 1
+      "punpckldq   %%xmm3,%%xmm5                 \n"  // i m j n from row 2, 3
+      "punpckhdq   %%xmm1,%%xmm6                 \n"  // c g d h from row 0, 1
+      "punpckhdq   %%xmm3,%%xmm7                 \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "movdqa      %%xmm4,%%xmm0                 \n"
+      "movdqa      %%xmm4,%%xmm1                 \n"
+      "movdqa      %%xmm6,%%xmm2                 \n"
+      "movdqa      %%xmm6,%%xmm3                 \n"
+      "punpcklqdq  %%xmm5,%%xmm0                 \n"  // a e i m from row 0, 1
+      "punpckhqdq  %%xmm5,%%xmm1                 \n"  // b f j n from row 0, 1
+      "punpcklqdq  %%xmm7,%%xmm2                 \n"  // c g k o from row 2, 3
+      "punpckhqdq  %%xmm7,%%xmm3                 \n"  // d h l p from row 2, 3
+
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         16(%1,%4),%1                  \n"  // dst += stride + 16
+      "movdqu      %%xmm1,-16(%1)                \n"
+      "movdqu      %%xmm2,-16(%1,%4)             \n"
+      "movdqu      %%xmm3,-16(%1,%4,2)           \n"
+      "sub         %4,%1                         \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 2 blocks of 4x4.  Read a column, write a row.
+      "1:          \n"
+      "vmovdqu     (%0),%%xmm0                   \n"  // a b c d
+      "vmovdqu     (%0,%3),%%xmm1                \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "vmovdqu     (%0),%%xmm2                   \n"  // i j k l
+      "vmovdqu     (%0,%3),%%xmm3                \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // a b c d
+      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "vinserti128 $1,(%0),%%ymm2,%%ymm2         \n"  // i j k l
+      "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3      \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "vpunpckldq  %%ymm1,%%ymm0,%%ymm4          \n"  // a e b f from row 0, 1
+      "vpunpckldq  %%ymm3,%%ymm2,%%ymm5          \n"  // i m j n from row 2, 3
+      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm6          \n"  // c g d h from row 0, 1
+      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm7          \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0          \n"  // a e i m from row 0, 1
+      "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1          \n"  // b f j n from row 0, 1
+      "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2          \n"  // c g k o from row 2, 3
+      "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3          \n"  // d h l p from row 2, 3
+
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         32(%1,%4),%1                  \n"  // dst += stride + 32
+      "vmovdqu     %%ymm1,-32(%1)                \n"
+      "vmovdqu     %%ymm2,-32(%1,%4)             \n"
+      "vmovdqu     %%ymm3,-32(%1,%4,2)           \n"
+      "sub         %4,%1                         \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_AVX2)
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
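For reference, the same shuffle sequence expressed with SSE2 intrinsics (an illustrative sketch under the assumption of one 4x4 block, not the committed code): two rounds of unpacks, first on 32-bit lanes and then on 64-bit lanes, turn four row registers into four column registers, exactly as the punpckldq/punpckhdq and punpcklqdq/punpckhqdq pairs do above. The AVX2 variant applies the identical lane-wise shuffles to two 4x4 blocks at once by stacking them in the low and high 128-bit halves of each ymm register (vinserti128), which is why it retires 8 destination columns per iteration (sub $0x8) versus 4 for SSE2.

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Illustrative sketch of one 4x4 block (hypothetical helper name), mirroring
// the register-level steps of the inline assembly above.
static void Transpose4x4Block_SSE2(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
  __m128i r0 = _mm_loadu_si128((const __m128i*)(src + 0 * src_stride));  // a b c d
  __m128i r1 = _mm_loadu_si128((const __m128i*)(src + 1 * src_stride));  // e f g h
  __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
  __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p

  // Transpose 2x2: interleave 32-bit lanes of row pairs.
  __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
  __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // i m j n
  __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // c g d h
  __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p

  // Transpose 4x4: interleave 64-bit lanes to form the columns.
  __m128i c0 = _mm_unpacklo_epi64(t0, t1);  // a e i m
  __m128i c1 = _mm_unpackhi_epi64(t0, t1);  // b f j n
  __m128i c2 = _mm_unpacklo_epi64(t2, t3);  // c g k o
  __m128i c3 = _mm_unpackhi_epi64(t2, t3);  // d h l p

  _mm_storeu_si128((__m128i*)(dst + 0 * dst_stride), c0);
  _mm_storeu_si128((__m128i*)(dst + 1 * dst_stride), c1);
  _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), c2);
  _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), c3);
}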
@@ -864,7 +864,55 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
 
 #if defined(ENABLE_ROW_TESTS)
 
-TEST_F(LibYUVRotateTest, Transpose4x4) {
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
+  // dst width and height
+  const int width = 4;
+  const int height = 4;
+  int src_pixels[4][4];
+  int dst_pixels_c[4][4];
+  int dst_pixels_opt[4][4];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      src_pixels[i][j] = i * 10 + j;
+    }
+  }
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  const int benchmark_iterations =
+      (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
+      (4 * 4);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+      EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+    }
+  }
+}
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
   // dst width and height
   const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
   const int height = 4;
@@ -874,29 +922,35 @@ TEST_F(LibYUVRotateTest, Transpose4x4) {
 
   MemRandomize(src_pixels, height * width * 4);
   memset(dst_pixels_c, 1, width * height * 4);
-  memset(dst_pixels_opt, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
 
   Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                     (uint8_t*)dst_pixels_c, width * 4, width);
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
-#if defined(__aarch64__)
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
     if (TestCpuFlag(kCpuHasNEON)) {
       Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
                            (uint8_t*)dst_pixels_opt, width * 4, width);
-    } else {
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
       Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                         (uint8_t*)dst_pixels_opt, width * 4, width);
     }
-#else
-    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
-                      (uint8_t*)dst_pixels_opt, width * 4, width);
-#endif
   }
 
-  // for (int i = 0; i < width * height; ++i) {
-  //   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-  // }
+  for (int i = 0; i < width * height; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
 
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_c);