diff --git a/include/rw_lock.h b/include/rw_lock.h
index 43d08b8..0565f78 100644
--- a/include/rw_lock.h
+++ b/include/rw_lock.h
@@ -14,7 +14,7 @@ class rw_lock {
     };
 
 public:
-    void lock(void) {
+    void lock() {
         while (1) {
             std::size_t expected = 0;
             if (lc_.compare_exchange_weak(expected, w_flag, std::memory_order_acq_rel)) {
@@ -24,11 +24,11 @@ public:
         }
     }
 
-    void unlock(void) {
+    void unlock() {  // resets lc_ to 0 (release store), the value lock() waits to CAS from
         lc_.store(0, std::memory_order_release);
     }
 
-    void lock_shared(void) {
+    void lock_shared() {
         while(1) {
             std::size_t old = lc_.load(std::memory_order_relaxed);
             std::size_t unlocked = old + 1;
@@ -41,7 +41,7 @@ public:
         }
     }
 
-    void unlock_shared(void) {
+    void unlock_shared() {  // atomically subtracts 1 from lc_ with release ordering
         lc_.fetch_sub(1, std::memory_order_release);
     }
 };
diff --git a/src/ipc.cpp b/src/ipc.cpp
index e5904ca..fd1de2d 100644
--- a/src/ipc.cpp
+++ b/src/ipc.cpp
@@ -1,3 +1,5 @@
+#include "ipc.h"
+
 #include <unordered_map>
 #include <memory>
 #include <type_traits>
@@ -6,7 +8,6 @@
 #include <algorithm>
 #include <utility>
 
-#include "ipc.h"
 #include "circ_queue.h"
 #include "rw_lock.h"
 
diff --git a/src/platform/shm_linux.cpp b/src/platform/shm_linux.cpp
index c64c568..d048fc4 100644
--- a/src/platform/shm_linux.cpp
+++ b/src/platform/shm_linux.cpp
@@ -1,11 +1,11 @@
+#include "shm.h"
+
 #include <sys/shm.h>
 #include <sys/stat.h>
 #include <sys/mman.h>
 #include <unistd.h>
 #include <fcntl.h>
 
-#include "shm.h"
-
 namespace ipc {
 namespace shm {
 
diff --git a/src/platform/shm_win.cpp b/src/platform/shm_win.cpp
index a2eda46..ee29e53 100644
--- a/src/platform/shm_win.cpp
+++ b/src/platform/shm_win.cpp
@@ -1,3 +1,5 @@
+#include "shm.h"
+
 #include <windows.h>
 
 #include <type_traits>
@@ -6,8 +8,6 @@
 #include <codecvt>
 #include <utility>
 
-#include "shm.h"
-
 namespace {
 
 template <typename T, typename S, typename R = S>
diff --git a/src/shm.cpp b/src/shm.cpp
index 688c91c..eec3b24 100644
--- a/src/shm.cpp
+++ b/src/shm.cpp
@@ -1,8 +1,8 @@
+#include "shm.h"
+
 #include <string>
 #include <utility>
 
-#include "shm.h"
-
 namespace ipc {
 namespace shm {
 
diff --git a/test/spin_lock.hpp b/test/spin_lock.hpp
new file mode 100644
index 0000000..8ca2635
--- /dev/null
+++ b/test/spin_lock.hpp
@@ -0,0 +1,104 @@
+/*
+    The Capo Library
+    Code covered by the MIT License
+
+    Author: mutouyun (http://orzz.org)
+*/
+
+#pragma once
+
+#include <atomic>       // std::atomic_flag, std::atomic_signal_fence
+#include <thread>       // std::this_thread
+#include <chrono>       // std::chrono::milliseconds
+#if defined(_MSC_VER)
+#include <windows.h>    // YieldProcessor
+#endif/*_MSC_VER*/
+
+namespace capo {
+namespace detail_spin_lock {
+
+////////////////////////////////////////////////////////////////
+/// Gives hint to processor that improves performance of spin-wait loops.
+////////////////////////////////////////////////////////////////
+
+#if defined(_MSC_VER)
+/*
+    See: http://msdn.microsoft.com/en-us/library/windows/desktop/ms687419(v=vs.85).aspx
+    Not for intel c++ compiler, so ignore http://software.intel.com/en-us/forums/topic/296168
+*/
+#   define CAPO_SPIN_LOCK_PAUSE_() YieldProcessor()
+#elif defined(__GNUC__)
+#if defined(__i386__) || defined(__x86_64__)
+/*
+    See: Intel(R) 64 and IA-32 Architectures Software Developer's Manual V2
+         PAUSE-Spin Loop Hint, 4-57
+         http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.html?wapkw=instruction+set+reference
+*/
+#   define CAPO_SPIN_LOCK_PAUSE_() __asm__ __volatile__("pause")
+#elif defined(__ia64__) || defined(__ia64)
+/*
+    See: Intel(R) Itanium(R) Architecture Developer's Manual, Vol.3
+         hint - Performance Hint, 3:145
+         http://www.intel.com/content/www/us/en/processors/itanium/itanium-architecture-vol-3-manual.html
+*/
+#   define CAPO_SPIN_LOCK_PAUSE_() __asm__ __volatile__ ("hint @pause")
+#elif defined(__arm__)
+/*
+    See: ARM Architecture Reference Manuals (YIELD)
+         http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.subset.architecture.reference/index.html
+*/
+#   define CAPO_SPIN_LOCK_PAUSE_() __asm__ __volatile__ ("yield")
+#endif
+#endif/*compilers*/
+
+#if !defined(CAPO_SPIN_LOCK_PAUSE_)
+/*
+    No pause hint is known for this target: fall back to a compiler fence, which prevents the compiler from optimizing the loop
+*/
+#   define CAPO_SPIN_LOCK_PAUSE_() std::atomic_signal_fence(std::memory_order_seq_cst)
+#endif/*!defined(CAPO_SPIN_LOCK_PAUSE_)*/
+
+////////////////////////////////////////////////////////////////
+/// Yield to other threads
+////////////////////////////////////////////////////////////////
+
+inline void yield(unsigned k)
+{
+    // Back-off ladder keyed on the attempt count k, tested from the top tier down:
+    //   k >= 32 sleeps for 1 ms, 16..31 yields the thread, 4..15 issues the pause
+    //   hint, and k < 4 returns immediately.
+    if      (k >= 32) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+    else if (k >= 16) { std::this_thread::yield(); }
+    else if (k >= 4)  { CAPO_SPIN_LOCK_PAUSE_(); }
+    /* else: k < 4, do nothing */
+}
+
+} // namespace detail_spin_lock
+
+////////////////////////////////////////////////////////////////
+/// Spinlock
+////////////////////////////////////////////////////////////////
+
+class spin_lock
+{
+    std::atomic_flag lc_ = ATOMIC_FLAG_INIT;   // set while the lock is held
+
+public:
+    bool try_lock()   // one attempt; true when the flag was clear and is now ours
+    {
+        return !lc_.test_and_set(std::memory_order_acquire);
+    }
+
+    void lock()       // retries try_lock(), handing the attempt count to yield()
+    {
+        unsigned attempts = 0;
+        while (!try_lock()) detail_spin_lock::yield(attempts++);
+    }
+
+    void unlock()     // clears the flag with release ordering
+    {
+        lc_.clear(std::memory_order_release);
+    }
+};
+
+} // namespace capo
diff --git a/test/test.h b/test/test.h
index 5fb915d..8952186 100644
--- a/test/test.h
+++ b/test/test.h
@@ -2,6 +2,10 @@
 
 #include <QtTest>
 
+#include <iostream>
+
+#include "stopwatch.hpp"
+
 class TestSuite : public QObject
 {
     Q_OBJECT
@@ -15,3 +19,21 @@ protected:
 protected slots:
     virtual void initTestCase();
 };
+
+struct test_stopwatch {  // timer shared by several threads: first start() wins
+    capo::stopwatch<> sw_;
+    std::atomic_flag started_ = ATOMIC_FLAG_INIT;  // set by the first start() call
+
+    void start() {
+        if (!started_.test_and_set()) {  // later callers see the flag and skip
+            sw_.start();
+        }
+    }
+    // Prints "[N:M, Loops]", elapsed ms and us per item; Loops*N is formed in double, not int.
+    void print_elapsed(int N, int M, int Loops) {
+        auto ts = sw_.elapsed<std::chrono::microseconds>();
+        std::cout << "[" << N << ":" << M << ", " << Loops << "]" << std::endl
+                  << "performance: " << (ts / 1000.0) << " ms, "
+                  << (double(ts) / (double(Loops) * double(N))) << " us/d" << std::endl;
+    }
+};
diff --git a/test/test_circ.cpp b/test/test_circ.cpp
index 53208db..61cc68a 100644
--- a/test/test_circ.cpp
+++ b/test/test_circ.cpp
@@ -9,7 +9,6 @@
 
 #include "circ_elem_array.h"
 #include "circ_queue.h"
-#include "stopwatch.hpp"
 #include "test.h"
 
 namespace {
@@ -73,23 +72,6 @@ struct msg_t {
     int dat_;
 };
 
-struct test_stopwatch {
-    capo::stopwatch<> sw_;
-    std::atomic_flag started_ = ATOMIC_FLAG_INIT;
-
-    void start() {
-        if (!started_.test_and_set()) {
-            sw_.start();
-        }
-    }
-
-    void print_elapsed(int N, int M, int Loops) {
-        auto ts = sw_.elapsed<std::chrono::microseconds>();
-        std::cout << "[" << N << ":" << M << ", " << Loops << "]" << std::endl
-                  << "performance: " << (double(ts) / double(Loops * N)) << " us/d" << std::endl;
-    }
-};
-
 template <bool V>
 struct test_verify {
     std::unordered_map<int, std::vector<int>>* list_;
diff --git a/test/test_ipc.cpp b/test/test_ipc.cpp
index eecc1ca..4311fe7 100644
--- a/test/test_ipc.cpp
+++ b/test/test_ipc.cpp
@@ -4,9 +4,12 @@
 #include <iostream>
 #include <shared_mutex>
 #include <mutex>
+#include <typeinfo>
 
 #include "ipc.h"
 #include "rw_lock.h"
+#include "stopwatch.hpp"
+#include "spin_lock.hpp"
 #include "test.h"
 
 namespace {
@@ -25,12 +28,28 @@ private slots:
 
 #include "test_ipc.moc"
 
-void Unit::test_rw_lock() {
-    std::thread r_trd[4];
-    std::thread w_trd[4];
+// Closed-form sum of the consecutive values b, b+1, ..., e: (first + last) * count / 2.
+template <typename T> constexpr T acc(T b, T e) {
+    return (b + e) * (e - b + 1) / 2;
+}
+
+// Gives an exclusive-only Mutex the shared-lock interface; shared access is exclusive here.
+template <typename Mutex> struct lc_wrapper : Mutex {
+    void lock_shared  () { this->lock  (); }  // this->: Mutex is a dependent base, so
+    void unlock_shared() { this->unlock(); }  // unqualified lookup would not find these
+};
+
+template <typename Lc, int Loops = 100000, int R = 4, int W = 4>
+void benchmark() {
+    std::thread r_trd[R];
+    std::thread w_trd[W];
+    std::atomic_int fini { 0 };
 
     std::vector<int> datas;
-    ipc::rw_lock lc;
+    Lc lc;
+
+    test_stopwatch sw;
+    std::cout << std::endl << typeid(Lc).name() << std::endl;
 
     for (auto& t : r_trd) {
         t = std::thread([&] {
@@ -39,7 +58,7 @@ void Unit::test_rw_lock() {
             while (1) {
                 int x = -1;
                 {
-                    [[maybe_unused]] std::shared_lock<ipc::rw_lock> guard { lc };
+                    [[maybe_unused]] std::shared_lock<Lc> guard { lc };
                     if (cnt < datas.size()) {
                         x = datas[cnt];
                     }
@@ -51,21 +70,23 @@ void Unit::test_rw_lock() {
                 }
                 std::this_thread::yield();
             }
-            std::size_t sum = 0;
-            for (int i : seq) {
-                sum += static_cast<std::size_t>(i);
+            if (++fini == std::extent<decltype(r_trd)>::value) {
+                sw.print_elapsed(R, W, Loops);
             }
-            std::cout << std::endl;
-            QCOMPARE(sum, 5050 * std::extent<decltype(w_trd)>::value);
+            std::uint64_t sum = 0;
+            for (int i : seq) sum += i;
+            QCOMPARE(sum, acc<std::uint64_t>(1, Loops) * std::extent<decltype(w_trd)>::value);
         });
     }
 
     for (auto& t : w_trd) {
         t = std::thread([&] {
-            for (int i = 1; i <= 100; ++i) {
-                lc.lock();
-                datas.push_back(i);
-                lc.unlock();
+            sw.start();
+            for (int i = 1; i <= Loops; ++i) {
+                {
+                    [[maybe_unused]] std::unique_lock<Lc> guard { lc };
+                    datas.push_back(i);
+                }
                 std::this_thread::yield();
             }
         });
@@ -75,13 +96,14 @@ void Unit::test_rw_lock() {
     lc.lock();
     datas.push_back(0);
     lc.unlock();
-
     for (auto& t : r_trd) t.join();
+}
 
-    for (int i : datas) {
-        std::cout << i << " ";
-    }
-    std::cout << std::endl;
+void Unit::test_rw_lock() {  // runs the same benchmark<Lc>() once per lock type
+    benchmark<ipc::rw_lock>();
+    benchmark<lc_wrapper<capo::spin_lock>>();  // lc_wrapper maps lock_shared() onto lock()
+    benchmark<lc_wrapper<std::mutex>>();       // likewise for std::mutex
+    benchmark<std::shared_mutex>();
 }
 
 void Unit::test_send_recv() {