fix bugs; optimize performance

2025-12-06 16:56:45 +08:00 · 2018-12-12 18:49:10 +08:00 · 2018-12-12 18:49:10 +08:00 · c40dddcc06
commit c40dddcc06
parent 29d25e2226
4 changed files with 88 additions and 13 deletions
--- a/include/rw_lock.h
+++ b/include/rw_lock.h
@ -2,25 +2,91 @@

 #include <atomic>
 #include <thread>
+#include <chrono>
 #include <limits>

+////////////////////////////////////////////////////////////////
+/// Gives the processor a hint that improves the performance of spin-wait loops.
+////////////////////////////////////////////////////////////////
+
+#pragma push_macro("IPC_LOCK_PAUSE_")
+#undef  IPC_LOCK_PAUSE_
+
+#if defined(_MSC_VER)
+#include <windows.h>    // YieldProcessor
+/*
+    See: http://msdn.microsoft.com/en-us/library/windows/desktop/ms687419(v=vs.85).aspx
+    Not for intel c++ compiler, so ignore http://software.intel.com/en-us/forums/topic/296168
+*/
+#   define IPC_LOCK_PAUSE_() YieldProcessor()
+#elif defined(__GNUC__)
+#if defined(__i386__) || defined(__x86_64__)
+/*
+    See: Intel(R) 64 and IA-32 Architectures Software Developer's Manual V2
+         PAUSE-Spin Loop Hint, 4-57
+         http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.html?wapkw=instruction+set+reference
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__("pause")
+#elif defined(__ia64__) || defined(__ia64)
+/*
+    See: Intel(R) Itanium(R) Architecture Developer's Manual, Vol.3
+         hint - Performance Hint, 3:145
+         http://www.intel.com/content/www/us/en/processors/itanium/itanium-architecture-vol-3-manual.html
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("hint @pause")
+#elif defined(__arm__)
+/*
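+    See: ARM Architecture Reference Manuals (YIELD)
+         http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.subset.architecture.reference/index.html
+    NOTE(review): __arm__ covers 32-bit ARM only; AArch64 compilers define __aarch64__ instead,
+         so those targets currently take the compiler-fence fallback below.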
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("yield")
+#endif
+#endif/*compilers*/
+
+#if !defined(IPC_LOCK_PAUSE_)
+/*
+    No pause instruction is available: just use a compiler fence to prevent the compiler from optimizing the loop away.
+*/
+#   define IPC_LOCK_PAUSE_() std::atomic_signal_fence(std::memory_order_seq_cst)
+#endif/*!defined(IPC_LOCK_PAUSE_)*/
+
+////////////////////////////////////////////////////////////////
+/// Yield to other threads
+////////////////////////////////////////////////////////////////
+
+namespace ipc {
+
+/// Back-off step for spin-wait loops; the action taken escalates as k grows:
+///   k <  4 : do nothing and return immediately
+///   k < 16 : execute IPC_LOCK_PAUSE_()
+///   k < 32 : std::this_thread::yield()
+///   else   : sleep for 1 millisecond
+/// \param k selects the back-off level; larger values back off harder.
+inline void yield(unsigned k) {
+    if (k < 4)  { /* Do nothing */ }
+    else
+    if (k < 16) { IPC_LOCK_PAUSE_(); }
+    else
+    if (k < 32) { std::this_thread::yield(); }
+    else
+    { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+}
+
+} // namespace ipc
+
+#pragma pop_macro("IPC_LOCK_PAUSE_")
+
 namespace ipc {

 class rw_lock {
    std::atomic_size_t lc_ { 0 };

    enum : std::size_t {
-        w_flag = std::numeric_limits<std::size_t>::max()
+        w_flag = (std::numeric_limits<std::size_t>::max)()
    };

 public:
    void lock() {
-        while (1) {
+        for (unsigned k = 0;; ++k) {
            std::size_t expected = 0;
            if (lc_.compare_exchange_weak(expected, w_flag, std::memory_order_acq_rel)) {
                break;
            }
-            std::this_thread::yield();
+            yield(k);
        }
    }

@ -29,14 +95,14 @@ public:
    }

    void lock_shared() {
-        while(1) {
+        for (unsigned k = 0;; ++k) {
            std::size_t old = lc_.load(std::memory_order_relaxed);
            std::size_t unlocked = old + 1;
            if (unlocked &&
                lc_.compare_exchange_weak(old, unlocked, std::memory_order_acq_rel)) {
                break;
            }
-            std::this_thread::yield();
+            yield(k);
            std::atomic_thread_fence(std::memory_order_acquire);
        }
    }
--- a/src/ipc.cpp
+++ b/src/ipc.cpp
@ -133,7 +133,7 @@ std::vector<byte_t> recv(handle_t h) {
            cache.resize(last_size + remain);
            std::memcpy(cache.data() + last_size, msg.data_, remain);
            // finish this message, erase it from cache
-            auto ret { std::move(cache) };
+            auto ret = std::move(cache);
            all.erase(msg.id_);
            return ret;
        }
--- a/test/test.h
+++ b/test/test.h
@ -3,6 +3,7 @@
 #include <QtTest>

 #include <iostream>
+#include <atomic>

 #include "stopwatch.hpp"

--- a/test/test_ipc.cpp
+++ b/test/test_ipc.cpp
@ -35,11 +35,11 @@ constexpr T acc(T b, T e) {

+// Gives Mutex a lock_shared()/unlock_shared() interface by forwarding both
+// calls to Mutex::lock() and Mutex::unlock().
 template <typename Mutex>
 struct lc_wrapper : Mutex {
-    void lock_shared  () { lock  (); }
-    void unlock_shared() { unlock(); }
+    // The calls are qualified with Mutex:: because members of a dependent base
+    // class are not found by unqualified name lookup inside a template.
+    void lock_shared  () { Mutex::lock  (); }
+    void unlock_shared() { Mutex::unlock(); }
 };

-template <typename Lc, int Loops = 100000, int R = 4, int W = 4>
+template <typename Lc, int R = 4, int W = 4, int Loops = 100000>
 void benchmark() {
    std::thread r_trd[R];
    std::thread w_trd[W];
@ -99,11 +99,19 @@ void benchmark() {
    for (auto& t : r_trd) t.join();
 }

+// Runs benchmark<> with the same R and W for each lock type under comparison:
+// ipc::rw_lock, capo::spin_lock and std::mutex (both through lc_wrapper), and
+// std::shared_mutex. Loops is left at benchmark's default value.
+// R and W size benchmark's r_trd / w_trd thread arrays
+// (presumably the reader and writer thread counts -- confirm against benchmark's body).
+template <int R, int W>
+void test_performance() {
+    benchmark<ipc::rw_lock               , R, W>();
+    benchmark<lc_wrapper<capo::spin_lock>, R, W>();
+    benchmark<lc_wrapper<std::mutex>     , R, W>();
+    benchmark<std::shared_mutex          , R, W>();
+}
+
+// Runs test_performance for four <R, W> combinations: <1, 1>, <4, 4>, <1, 8> and <8, 1>.
 void Unit::test_rw_lock() {
-    benchmark<ipc::rw_lock>();
-    benchmark<lc_wrapper<capo::spin_lock>>();
-    benchmark<lc_wrapper<std::mutex>>();
-    benchmark<std::shared_mutex>();
+    test_performance<1, 1>();
+    test_performance<4, 4>();
+    test_performance<1, 8>();
+    test_performance<8, 1>();
 }

 void Unit::test_send_recv() {