Transpose 4x4 for SSE2 and AVX2
Skylake Xeon
  AVX2 Transpose4x4_Opt (290 ms)
  SSE2 Transpose4x4_Opt (302 ms)
  C    Transpose4x4_Opt (522 ms)
AMD Zen2
  AVX2 Transpose4x4_Opt (136 ms)
  SSE2 Transpose4x4_Opt (137 ms)
  C    Transpose4x4_Opt (431 ms)

Bug: None
Change-Id: I4997dbd5c5387c22bfd6c5960b421504e4bc8a2a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4292946
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
commit f9b23b9cc0
parent e66f436560
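As context for the diff below: the kernels added by this change transpose 4x4 blocks of 32-bit (ARGB) pixels, reading four source rows at a time and writing them as four destination columns. The following scalar sketch is illustrative only (the helper name is hypothetical, not part of the change); it assumes the same (src, src_stride, dst, dst_stride, width) convention as the existing Transpose4x4_32_C, where width is the destination width, i.e. the number of 4-pixel source rows, processed in steps of 4.

#include <stdint.h>

// Illustrative scalar equivalent of the 4x4 32-bit transpose (hypothetical
// name).  Each step reads a 4x4 block of uint32 pixels from four consecutive
// source rows and writes it transposed, so source row (x + i) becomes
// destination column (x + i).
static void Transpose4x4_32_Sketch(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride, int width) {
  for (int x = 0; x < width; x += 4) {  // 4 source rows / 4 dst columns per block
    for (int i = 0; i < 4; ++i) {       // row within the block
      for (int j = 0; j < 4; ++j) {     // column within the block
        const uint32_t* s =
            (const uint32_t*)(src + (intptr_t)(x + i) * src_stride) + j;
        uint32_t* d = (uint32_t*)(dst + (intptr_t)j * dst_stride) + (x + i);
        *d = *s;
      }
    }
  }
}

With width == 4 and both strides equal to 16 bytes this reduces to dst[i][j] = src[j][i], which is exactly what the new Transpose4x4_Test unit test below checks.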
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1861
+Version: 1862
 License: BSD
 License File: LICENSE
 
@@ -42,6 +42,8 @@ extern "C" {
 // The following are available for GCC 32 or 64 bit:
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
 #endif
 
 // The following are available for 64 bit GCC:
@@ -56,6 +58,11 @@ extern "C" {
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSE4X4_32_NEON
+#endif
+
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_TRANSPOSEWX16_MSA
 #define HAS_TRANSPOSEUVWX16_MSA
@@ -240,19 +247,24 @@ void Transpose4x4_32_NEON(const uint8_t* src,
                           int dst_stride,
                           int width);
 
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
 void Transpose4x4_32_C(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width);
 
-// Transpose 32 bit values (ARGB)
-void Transpose8x8_32_NEON(const uint8_t* src,
-                          int src_stride,
-                          uint8_t* dst,
-                          int dst_stride,
-                          int width);
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
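The declarations above, together with the new HAS_TRANSPOSE4X4_32_* macros, are what a caller dispatches on. As a rough sketch only (the wrapper below is hypothetical and not part of this change), runtime selection would mirror the #if / TestCpuFlag pattern used in the updated tests further down:

#include "libyuv/cpu_id.h"      // TestCpuFlag, kCpuHas* flags
#include "libyuv/rotate_row.h"  // Transpose4x4_32_* declarations

// Hypothetical wrapper: prefer AVX2, then SSE2, then NEON, else plain C.
static void Transpose4x4_32_Any(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride, int width) {
#if defined(HAS_TRANSPOSE4X4_32_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    Transpose4x4_32_AVX2(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
#if defined(HAS_TRANSPOSE4X4_32_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    Transpose4x4_32_SSE2(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
#if defined(HAS_TRANSPOSE4X4_32_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    Transpose4x4_32_NEON(src, src_stride, dst, dst_stride, width);
    return;
  }
#endif
  Transpose4x4_32_C(src, src_stride, dst, dst_stride, width);
}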
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1861
+#define LIBYUV_VERSION 1862
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
@@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
         "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_SSE2)
+// 4 values, little endian view
+// a b c d
+// e f g h
+// i j k l
+// m n o p
+
+// transpose 2x2
+// a e b f   from row 0, 1
+// i m j n   from row 2, 3
+// c g d h   from row 0, 1
+// k o l p   from row 2, 3
+
+// transpose 4x4
+// a e i m   from row 0, 1
+// b f j n   from row 0, 1
+// c g k o   from row 2, 3
+// d h l p   from row 2, 3
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 4x4.  Read a column, write a row.
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                   \n"  // a b c d
+      "movdqu      (%0,%3),%%xmm1                \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "movdqu      (%0),%%xmm2                   \n"  // i j k l
+      "movdqu      (%0,%3),%%xmm3                \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "movdqa      %%xmm0,%%xmm4                 \n"
+      "movdqa      %%xmm2,%%xmm5                 \n"
+      "movdqa      %%xmm0,%%xmm6                 \n"
+      "movdqa      %%xmm2,%%xmm7                 \n"
+      "punpckldq   %%xmm1,%%xmm4                 \n"  // a e b f from row 0, 1
+      "punpckldq   %%xmm3,%%xmm5                 \n"  // i m j n from row 2, 3
+      "punpckhdq   %%xmm1,%%xmm6                 \n"  // c g d h from row 0, 1
+      "punpckhdq   %%xmm3,%%xmm7                 \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "movdqa      %%xmm4,%%xmm0                 \n"
+      "movdqa      %%xmm4,%%xmm1                 \n"
+      "movdqa      %%xmm6,%%xmm2                 \n"
+      "movdqa      %%xmm6,%%xmm3                 \n"
+      "punpcklqdq  %%xmm5,%%xmm0                 \n"  // a e i m from row 0, 1
+      "punpckhqdq  %%xmm5,%%xmm1                 \n"  // b f j n from row 0, 1
+      "punpcklqdq  %%xmm7,%%xmm2                 \n"  // c g k o from row 2, 3
+      "punpckhqdq  %%xmm7,%%xmm3                 \n"  // d h l p from row 2, 3
+
+      "movdqu      %%xmm0,(%1)                   \n"
+      "lea         16(%1,%4),%1                  \n"  // dst += stride + 16
+      "movdqu      %%xmm1,-16(%1)                \n"
+      "movdqu      %%xmm2,-16(%1,%4)             \n"
+      "movdqu      %%xmm3,-16(%1,%4,2)           \n"
+      "sub         %4,%1                         \n"
+      "sub         $0x4,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 2 blocks of 4x4.  Read a column, write a row.
+      "1:          \n"
+      "vmovdqu     (%0),%%xmm0                   \n"  // a b c d
+      "vmovdqu     (%0,%3),%%xmm1                \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "vmovdqu     (%0),%%xmm2                   \n"  // i j k l
+      "vmovdqu     (%0,%3),%%xmm3                \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      "vinserti128 $1,(%0),%%ymm0,%%ymm0         \n"  // a b c d
+      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1      \n"  // e f g h
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+      "vinserti128 $1,(%0),%%ymm2,%%ymm2         \n"  // i j k l
+      "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3      \n"  // m n o p
+      "lea         (%0,%3,2),%0                  \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "vpunpckldq  %%ymm1,%%ymm0,%%ymm4          \n"  // a e b f from row 0, 1
+      "vpunpckldq  %%ymm3,%%ymm2,%%ymm5          \n"  // i m j n from row 2, 3
+      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm6          \n"  // c g d h from row 0, 1
+      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm7          \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0          \n"  // a e i m from row 0, 1
+      "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1          \n"  // b f j n from row 0, 1
+      "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2          \n"  // c g k o from row 2, 3
+      "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3          \n"  // d h l p from row 2, 3
+
+      "vmovdqu     %%ymm0,(%1)                   \n"
+      "lea         32(%1,%4),%1                  \n"  // dst += stride + 32
+      "vmovdqu     %%ymm1,-32(%1)                \n"
+      "vmovdqu     %%ymm2,-32(%1,%4)             \n"
+      "vmovdqu     %%ymm3,-32(%1,%4,2)           \n"
+      "sub         %4,%1                         \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper  \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_AVX2)
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
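For reference, the same shuffle sequence expressed with SSE2 intrinsics (an illustrative sketch under the assumption of one 4x4 block, not the committed code): two rounds of unpacks, first on 32-bit lanes and then on 64-bit lanes, turn four row registers into four column registers, exactly as the punpckldq/punpckhdq and punpcklqdq/punpckhqdq pairs do above. The AVX2 variant applies the identical lane-wise shuffles to two 4x4 blocks at once by stacking them in the low and high 128-bit halves of each ymm register (vinserti128), which is why it retires 8 destination columns per iteration (sub $0x8) versus 4 for SSE2.

#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

// Illustrative sketch of one 4x4 block (hypothetical helper name), mirroring
// the register-level steps of the inline assembly above.
static void Transpose4x4Block_SSE2(const uint8_t* src, int src_stride,
                                   uint8_t* dst, int dst_stride) {
  __m128i r0 = _mm_loadu_si128((const __m128i*)(src + 0 * src_stride));  // a b c d
  __m128i r1 = _mm_loadu_si128((const __m128i*)(src + 1 * src_stride));  // e f g h
  __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
  __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p

  // Transpose 2x2: interleave 32-bit lanes of row pairs.
  __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
  __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // i m j n
  __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // c g d h
  __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p

  // Transpose 4x4: interleave 64-bit lanes to form the columns.
  __m128i c0 = _mm_unpacklo_epi64(t0, t1);  // a e i m
  __m128i c1 = _mm_unpackhi_epi64(t0, t1);  // b f j n
  __m128i c2 = _mm_unpacklo_epi64(t2, t3);  // c g k o
  __m128i c3 = _mm_unpackhi_epi64(t2, t3);  // d h l p

  _mm_storeu_si128((__m128i*)(dst + 0 * dst_stride), c0);
  _mm_storeu_si128((__m128i*)(dst + 1 * dst_stride), c1);
  _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), c2);
  _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), c3);
}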
@@ -864,7 +864,55 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
 
 #if defined(ENABLE_ROW_TESTS)
 
-TEST_F(LibYUVRotateTest, Transpose4x4) {
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
+  // dst width and height
+  const int width = 4;
+  const int height = 4;
+  int src_pixels[4][4];
+  int dst_pixels_c[4][4];
+  int dst_pixels_opt[4][4];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      src_pixels[i][j] = i * 10 + j;
+    }
+  }
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  const int benchmark_iterations =
+      (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
+      (4 * 4);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+      EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+    }
+  }
+}
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
   // dst width and height
   const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
   const int height = 4;
@@ -874,29 +922,35 @@ TEST_F(LibYUVRotateTest, Transpose4x4) {
 
   MemRandomize(src_pixels, height * width * 4);
   memset(dst_pixels_c, 1, width * height * 4);
-  memset(dst_pixels_opt, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
 
   Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                     (uint8_t*)dst_pixels_c, width * 4, width);
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
-#if defined(__aarch64__)
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
     if (TestCpuFlag(kCpuHasNEON)) {
       Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
                            (uint8_t*)dst_pixels_opt, width * 4, width);
-    } else {
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
       Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                         (uint8_t*)dst_pixels_opt, width * 4, width);
     }
-#else
-    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
-                      (uint8_t*)dst_pixels_opt, width * 4, width);
-#endif
   }
 
-  // for (int i = 0; i < width * height; ++i) {
-  //   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-  // }
+  for (int i = 0; i < width * height; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
 
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_c);