Transpose 4x4 for SSE2 and AVX2

Skylake Xeon
AVX2 Transpose4x4_Opt (290 ms)
SSE2 Transpose4x4_Opt (302 ms)
C    Transpose4x4_Opt (522 ms)

AMD Zen2
AVX2 Transpose4x4_Opt (136 ms)
SSE2 Transpose4x4_Opt (137 ms)
C    Transpose4x4_Opt (431 ms)

Bug: None
Change-Id: I4997dbd5c5387c22bfd6c5960b421504e4bc8a2a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4292946
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Authored by Frank Barchard on 2023-02-27 01:23:59 -08:00; committed by libyuv LUCI CQ
parent e66f436560
commit f9b23b9cc0
5 changed files with 216 additions and 20 deletions

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1861
+Version: 1862
License: BSD
License File: LICENSE

include/libyuv/rotate_row.h

@@ -42,6 +42,8 @@ extern "C" {
// The following are available for GCC 32 or 64 bit:
#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSEWX8_SSSE3
#define HAS_TRANSPOSE4X4_32_SSE2
#define HAS_TRANSPOSE4X4_32_AVX2
#endif
// The following are available for 64 bit GCC:
@@ -56,6 +58,11 @@ extern "C" {
#define HAS_TRANSPOSEUVWX8_NEON
#endif
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_TRANSPOSE4X4_32_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_TRANSPOSEWX16_MSA
#define HAS_TRANSPOSEUVWX16_MSA
@@ -240,19 +247,24 @@ void Transpose4x4_32_NEON(const uint8_t* src,
int dst_stride,
int width);
void Transpose4x4_32_SSE2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
void Transpose4x4_32_AVX2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
void Transpose4x4_32_C(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
// Transpose 32 bit values (ARGB)
void Transpose8x8_32_NEON(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1861
+#define LIBYUV_VERSION 1862
#endif // INCLUDE_LIBYUV_VERSION_H_

source/rotate_gcc.cc

@@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
"xmm7", "xmm8", "xmm9");
}
#endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
#if defined(HAS_TRANSPOSE4X4_32_SSE2)
// 4 values, little endian view
// a b c d
// e f g h
// i j k l
// m n o p
// transpose 2x2
// a e b f from row 0, 1
// i m j n from row 2, 3
// c g d h from row 0, 1
// k o l p from row 2, 3
// transpose 4x4
// a e i m from row 0, 1
// b f j n from row 0, 1
// c g k o from row 2, 3
// d h l p from row 2, 3
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_SSE2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
// Main loop transpose 4x4. Read a column, write a row.
"1: \n"
"movdqu (%0),%%xmm0 \n" // a b c d
"movdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"movdqu (%0),%%xmm2 \n" // i j k l
"movdqu (%0,%3),%%xmm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
// Transpose 2x2
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm2,%%xmm5 \n"
"movdqa %%xmm0,%%xmm6 \n"
"movdqa %%xmm2,%%xmm7 \n"
"punpckldq %%xmm1,%%xmm4 \n" // a e b f from row 0, 1
"punpckldq %%xmm3,%%xmm5 \n" // i m j n from row 2, 3
"punpckhdq %%xmm1,%%xmm6 \n" // c g d h from row 0, 1
"punpckhdq %%xmm3,%%xmm7 \n" // k o l p from row 2, 3
// Transpose 4x4
"movdqa %%xmm4,%%xmm0 \n"
"movdqa %%xmm4,%%xmm1 \n"
"movdqa %%xmm6,%%xmm2 \n"
"movdqa %%xmm6,%%xmm3 \n"
"punpcklqdq %%xmm5,%%xmm0 \n" // a e i m from row 0, 1
"punpckhqdq %%xmm5,%%xmm1 \n" // b f j n from row 0, 1
"punpcklqdq %%xmm7,%%xmm2 \n" // c g k o from row 2, 3
"punpckhqdq %%xmm7,%%xmm3 \n" // d h l p from row 2, 3
"movdqu %%xmm0,(%1) \n"
"lea 16(%1,%4),%1 \n" // dst += stride + 16
"movdqu %%xmm1,-16(%1) \n"
"movdqu %%xmm2,-16(%1,%4) \n"
"movdqu %%xmm3,-16(%1,%4,2) \n"
"sub %4,%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+rm"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSE4X4_32_SSE2)
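For reference, the unpack sequence in the block above maps one-to-one onto SSE2 intrinsics. The following is a minimal single-tile sketch, not part of this change: the function name Transpose4x4_32_SSE2_intrin is hypothetical, and the width loop and pointer walking of the real routine are omitted.

#include <emmintrin.h>  // SSE2
#include <stdint.h>

// Hypothetical single-tile equivalent of the asm loop body above:
// reads one 4x4 block of 32-bit values and writes it transposed.
static void Transpose4x4_32_SSE2_intrin(const uint8_t* src, int src_stride,
                                        uint8_t* dst, int dst_stride) {
  __m128i r0 = _mm_loadu_si128((const __m128i*)(src + 0 * src_stride));  // a b c d
  __m128i r1 = _mm_loadu_si128((const __m128i*)(src + 1 * src_stride));  // e f g h
  __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
  __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p

  // Transpose 2x2: interleave 32-bit lanes (punpckldq / punpckhdq).
  __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
  __m128i t1 = _mm_unpacklo_epi32(r2, r3);  // i m j n
  __m128i t2 = _mm_unpackhi_epi32(r0, r1);  // c g d h
  __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p

  // Transpose 4x4: interleave 64-bit lanes (punpcklqdq / punpckhqdq).
  __m128i o0 = _mm_unpacklo_epi64(t0, t1);  // a e i m
  __m128i o1 = _mm_unpackhi_epi64(t0, t1);  // b f j n
  __m128i o2 = _mm_unpacklo_epi64(t2, t3);  // c g k o
  __m128i o3 = _mm_unpackhi_epi64(t2, t3);  // d h l p

  _mm_storeu_si128((__m128i*)(dst + 0 * dst_stride), o0);
  _mm_storeu_si128((__m128i*)(dst + 1 * dst_stride), o1);
  _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), o2);
  _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), o3);
}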
#if defined(HAS_TRANSPOSE4X4_32_AVX2)
// Transpose 32 bit values (ARGB)
void Transpose4x4_32_AVX2(const uint8_t* src,
int src_stride,
uint8_t* dst,
int dst_stride,
int width) {
asm volatile(
// Main loop transpose 2 blocks of 4x4. Read a column, write a row.
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // a b c d
"vmovdqu (%0,%3),%%xmm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vmovdqu (%0),%%xmm2 \n" // i j k l
"vmovdqu (%0,%3),%%xmm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // a b c d
"vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // e f g h
"lea (%0,%3,2),%0 \n" // src += stride * 2
"vinserti128 $1,(%0),%%ymm2,%%ymm2 \n" // i j k l
"vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n" // m n o p
"lea (%0,%3,2),%0 \n" // src += stride * 2
// Transpose 2x2
"vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n" // a e b f from row 0, 1
"vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n" // i m j n from row 2, 3
"vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n" // c g d h from row 0, 1
"vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n" // k o l p from row 2, 3
// Transpose 4x4
"vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n" // a e i m from row 0, 1
"vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n" // b f j n from row 0, 1
"vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n" // c g k o from row 2, 3
"vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n" // d h l p from row 2, 3
"vmovdqu %%ymm0,(%1) \n"
"lea 32(%1,%4),%1 \n" // dst += stride + 32
"vmovdqu %%ymm1,-32(%1) \n"
"vmovdqu %%ymm2,-32(%1,%4) \n"
"vmovdqu %%ymm3,-32(%1,%4,2) \n"
"sub %4,%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+rm"(width) // %2
: "r"((ptrdiff_t)(src_stride)), // %3
"r"((ptrdiff_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // defined(HAS_TRANSPOSE4X4_32_AVX2)
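The AVX2 routine above keeps the same unpack pattern but carries two vertically adjacent 4x4 tiles per iteration: the second tile's rows are placed in the upper 128-bit lane with vinserti128, and because the vpunpck* instructions operate independently per lane, each 32-byte store writes eight transposed 32-bit values per destination row (columns 0-3 from the first tile, columns 4-7 from the second), which is why the counter steps by 8. A minimal single-iteration intrinsics sketch follows; the name Transpose4x4x2_32_AVX2_intrin is hypothetical and the outer loop is omitted.

#include <immintrin.h>  // AVX2
#include <stdint.h>

// Hypothetical single-iteration equivalent of the AVX2 loop body above:
// transposes two vertically adjacent 4x4 blocks of 32-bit values, writing
// eight transposed values per destination row.
static void Transpose4x4x2_32_AVX2_intrin(const uint8_t* src, int src_stride,
                                          uint8_t* dst, int dst_stride) {
  // Rows 0..3 (first tile) in the low lanes, rows 4..7 (second tile) in the
  // high lanes, mirroring the vmovdqu + vinserti128 loads.
  __m256i r0 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 0 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 4 * src_stride)), 1);
  __m256i r1 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 1 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 5 * src_stride)), 1);
  __m256i r2 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 2 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 6 * src_stride)), 1);
  __m256i r3 = _mm256_inserti128_si256(
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(src + 3 * src_stride))),
      _mm_loadu_si128((const __m128i*)(src + 7 * src_stride)), 1);

  // Per-lane 2x2 then 4x4 transpose, exactly as in the SSE2 path.
  __m256i t0 = _mm256_unpacklo_epi32(r0, r1);
  __m256i t1 = _mm256_unpacklo_epi32(r2, r3);
  __m256i t2 = _mm256_unpackhi_epi32(r0, r1);
  __m256i t3 = _mm256_unpackhi_epi32(r2, r3);
  __m256i o0 = _mm256_unpacklo_epi64(t0, t1);
  __m256i o1 = _mm256_unpackhi_epi64(t0, t1);
  __m256i o2 = _mm256_unpacklo_epi64(t2, t3);
  __m256i o3 = _mm256_unpackhi_epi64(t2, t3);

  // Each store covers dst columns 0..7 of one transposed row.
  _mm256_storeu_si256((__m256i*)(dst + 0 * dst_stride), o0);
  _mm256_storeu_si256((__m256i*)(dst + 1 * dst_stride), o1);
  _mm256_storeu_si256((__m256i*)(dst + 2 * dst_stride), o2);
  _mm256_storeu_si256((__m256i*)(dst + 3 * dst_stride), o3);
}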
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

unit_test/rotate_test.cc

@@ -864,7 +864,55 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
#if defined(ENABLE_ROW_TESTS)
-TEST_F(LibYUVRotateTest, Transpose4x4) {
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
// dst width and height
const int width = 4;
const int height = 4;
int src_pixels[4][4];
int dst_pixels_c[4][4];
int dst_pixels_opt[4][4];
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
src_pixels[i][j] = i * 10 + j;
}
}
memset(dst_pixels_c, 1, width * height * 4);
memset(dst_pixels_opt, 2, width * height * 4);
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_c, width * 4, width);
const int benchmark_iterations =
(benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
(4 * 4);
for (int i = 0; i < benchmark_iterations; ++i) {
#if defined(HAS_TRANSPOSE4X4_32_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
}
}
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
}
}
}
TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
// dst width and height
const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
const int height = 4;
@@ -874,29 +922,35 @@ TEST_F(LibYUVRotateTest, Transpose4x4)
MemRandomize(src_pixels, height * width * 4);
memset(dst_pixels_c, 1, width * height * 4);
-memset(dst_pixels_opt, 1, width * height * 4);
+memset(dst_pixels_opt, 2, width * height * 4);
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_c, width * 4, width);
for (int i = 0; i < benchmark_iterations_; ++i) {
-#if defined(__aarch64__)
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
-} else {
+} else
#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else if (TestCpuFlag(kCpuHasSSE2)) {
Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
} else
#endif
{
Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
(uint8_t*)dst_pixels_opt, width * 4, width);
}
-#else
-Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
-(uint8_t*)dst_pixels_opt, width * 4, width);
-#endif
}
-// for (int i = 0; i < width * height; ++i) {
-//   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-// }
+for (int i = 0; i < width * height; ++i) {
+EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_c);