diff --git a/include/circ_elem_array.h b/include/circ_elem_array.h
index 9ab4f59..95c86be 100644
--- a/include/circ_elem_array.h
+++ b/include/circ_elem_array.h
@@ -38,8 +38,8 @@ struct alignas(std::max_align_t) elem_array_head {
     }
 
     auto acquire(void) {
-        while (lc_.exchange(1, std::memory_order_acquire)) {
-            std::this_thread::yield();
+        for (unsigned k = 0; lc_.exchange(1, std::memory_order_acquire); ++k) { // retry until exchange() returns 0, i.e. lc_ was not already set
+            yield(k); // k counts the failed attempts so far; yield(k) chooses how to wait based on it
         }
         return index_of(wt_.load(std::memory_order_relaxed));
     }
@@ -107,7 +107,7 @@ public:
     void* acquire(void) {
         elem_t* el = elem(base_t::acquire());
         // check all consumers have finished reading
-        while(1) {
+        for (unsigned k = 0;; ++k) {
             uint_t<32> expected = 0;
             if (el->head_.rc_.compare_exchange_weak(
                         expected,
@@ -115,7 +115,7 @@ public:
                         std::memory_order_release)) {
                 break;
             }
-            std::this_thread::yield();
+            yield(k);
         }
         return el->data_;
     }
diff --git a/include/def.h b/include/def.h
index 4fd1730..111e7e2 100644
--- a/include/def.h
+++ b/include/def.h
@@ -3,6 +3,9 @@
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <atomic>
+#include <thread>
+#include <chrono>
 
 namespace ipc {
 
@@ -28,3 +31,68 @@ enum : std::size_t {
 };
 
 } // namespace ipc
+
+////////////////////////////////////////////////////////////////
+/// Gives the processor a hint that improves the performance of spin-wait loops.
+////////////////////////////////////////////////////////////////
+
+#pragma push_macro("IPC_LOCK_PAUSE_")
+#undef  IPC_LOCK_PAUSE_
+
+#if defined(_MSC_VER)
+#include <windows.h>    // YieldProcessor
+/*
+    See: http://msdn.microsoft.com/en-us/library/windows/desktop/ms687419(v=vs.85).aspx
+    Not for intel c++ compiler, so ignore http://software.intel.com/en-us/forums/topic/296168
+*/
+#   define IPC_LOCK_PAUSE_() YieldProcessor()
+#elif defined(__GNUC__)
+#if defined(__i386__) || defined(__x86_64__)
+/*
+    See: Intel(R) 64 and IA-32 Architectures Software Developer's Manual V2
+         PAUSE-Spin Loop Hint, 4-57
+         http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.html?wapkw=instruction+set+reference
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__("pause")
+#elif defined(__ia64__) || defined(__ia64)
+/*
+    See: Intel(R) Itanium(R) Architecture Developer's Manual, Vol.3
+         hint - Performance Hint, 3:145
+         http://www.intel.com/content/www/us/en/processors/itanium/itanium-architecture-vol-3-manual.html
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("hint @pause")
+#elif defined(__arm__) || defined(__aarch64__)
+/*
+    See: ARM Architecture Reference Manuals (YIELD hint; 64-bit targets define __aarch64__, not __arm__)
+         http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.subset.architecture.reference/index.html
+*/
+#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("yield")
+#endif
+#endif/*compilers*/
+
+#if !defined(IPC_LOCK_PAUSE_)
+/*
+    Fallback when no hint is known: a compiler-only fence (std::atomic_signal_fence, from <atomic>)
+*/
+#   define IPC_LOCK_PAUSE_() std::atomic_signal_fence(std::memory_order_seq_cst)
+#endif/*!defined(IPC_LOCK_PAUSE_)*/
+
+////////////////////////////////////////////////////////////////
+/// Yield to other threads
+////////////////////////////////////////////////////////////////
+
+namespace ipc {
+
+inline void yield(unsigned k) {
+    if (k >= 32) {        // attempts 32 and up: sleep for 1 ms
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+    } else if (k >= 16) { // attempts 16..31: hint the scheduler to run other threads
+        std::this_thread::yield();
+    } else if (k >= 4) {  // attempts 4..15: issue the spin-wait hint
+        IPC_LOCK_PAUSE_();
+    }                     // attempts 0..3: return immediately
+}
+
+} // namespace ipc
+
+#pragma pop_macro("IPC_LOCK_PAUSE_")
diff --git a/include/rw_lock.h b/include/rw_lock.h
index b382e88..3290337 100644
--- a/include/rw_lock.h
+++ b/include/rw_lock.h
@@ -1,74 +1,9 @@
 #pragma once
 
 #include <atomic>
-#include <thread>
-#include <chrono>
 #include <limits>
 
-////////////////////////////////////////////////////////////////
-/// Gives hint to processor that improves performance of spin-wait loops.
-////////////////////////////////////////////////////////////////
-
-#pragma push_macro("IPC_LOCK_PAUSE_")
-#undef  IPC_LOCK_PAUSE_
-
-#if defined(_MSC_VER)
-#include <windows.h>    // YieldProcessor
-/*
-    See: http://msdn.microsoft.com/en-us/library/windows/desktop/ms687419(v=vs.85).aspx
-    Not for intel c++ compiler, so ignore http://software.intel.com/en-us/forums/topic/296168
-*/
-#   define IPC_LOCK_PAUSE_() YieldProcessor()
-#elif defined(__GNUC__)
-#if defined(__i386__) || defined(__x86_64__)
-/*
-    See: Intel(R) 64 and IA-32 Architectures Software Developer's Manual V2
-         PAUSE-Spin Loop Hint, 4-57
-         http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.html?wapkw=instruction+set+reference
-*/
-#   define IPC_LOCK_PAUSE_() __asm__ __volatile__("pause")
-#elif defined(__ia64__) || defined(__ia64)
-/*
-    See: Intel(R) Itanium(R) Architecture Developer's Manual, Vol.3
-         hint - Performance Hint, 3:145
-         http://www.intel.com/content/www/us/en/processors/itanium/itanium-architecture-vol-3-manual.html
-*/
-#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("hint @pause")
-#elif defined(__arm__)
-/*
-    See: ARM Architecture Reference Manuals (YIELD)
-         http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.subset.architecture.reference/index.html
-*/
-#   define IPC_LOCK_PAUSE_() __asm__ __volatile__ ("yield")
-#endif
-#endif/*compilers*/
-
-#if !defined(IPC_LOCK_PAUSE_)
-/*
-    Just use a compiler fence, prevent compiler from optimizing loop
-*/
-#   define IPC_LOCK_PAUSE_() std::atomic_signal_fence(std::memory_order_seq_cst)
-#endif/*!defined(IPC_LOCK_PAUSE_)*/
-
-////////////////////////////////////////////////////////////////
-/// Yield to other threads
-////////////////////////////////////////////////////////////////
-
-namespace ipc {
-
-inline void yield(unsigned k) {
-    if (k < 4)  { /* Do nothing */ }
-    else
-    if (k < 16) { IPC_LOCK_PAUSE_(); }
-    else
-    if (k < 32) { std::this_thread::yield(); }
-    else
-    { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
-}
-
-} // namespace ipc
-
-#pragma pop_macro("IPC_LOCK_PAUSE_")
+#include "def.h"
 
 namespace ipc {
 
diff --git a/test/test_circ.cpp b/test/test_circ.cpp
index 61cc68a..b42d2a5 100644
--- a/test/test_circ.cpp
+++ b/test/test_circ.cpp
@@ -31,7 +31,7 @@ private slots:
     void test_prod_cons_performance();
 
     void test_queue();
-} /*unit__*/;
+} unit__;
 
 #include "test_circ.moc"
 
diff --git a/test/test_ipc.cpp b/test/test_ipc.cpp
index ce40e11..fe47bd7 100644
--- a/test/test_ipc.cpp
+++ b/test/test_ipc.cpp
@@ -44,10 +44,10 @@ struct lc_wrapper : Mutex {
     void unlock_shared() { Mutex::unlock(); }
 };
 
-template <typename Lc, int R = 4, int W = 4, int Loops = 100000>
+template <typename Lc, int W = 4, int R = 4, int Loops = 100000>
 void benchmark() {
-    std::thread r_trd[R];
     std::thread w_trd[W];
+    std::thread r_trd[R];
     std::atomic_int fini { 0 };
 
     std::vector<int> datas;
@@ -86,7 +86,7 @@ void benchmark() {
                 std::this_thread::yield();
             }
             if (++fini == std::extent<decltype(r_trd)>::value) {
-                sw.print_elapsed(R, W, Loops);
+                sw.print_elapsed(W, R, Loops);
             }
             std::int64_t sum = 0;
             for (int i : seq) sum += i;
@@ -114,17 +114,17 @@ void benchmark() {
     for (auto& t : r_trd) t.join();
 }
 
-template <int R, int W>
+template <int W, int R> // forwarded to benchmark<Lc, W, R>, where W sizes w_trd and R sizes r_trd
 void test_performance() {
 
     std::cout << std::endl
-              << "test_performance: [" << R << ":" << W << "]"
+              << "test_performance: [" << W << ":" << R << "]" // header is printed as [W:R]
               << std::endl;
 
-    benchmark<ipc::rw_lock               , R, W>();
-    benchmark<lc_wrapper<capo::spin_lock>, R, W>();
-    benchmark<lc_wrapper<std::mutex>     , R, W>();
-    benchmark<std::shared_timed_mutex    , R, W>();
+    benchmark<ipc::rw_lock               , W, R>(); // same benchmark, once per lock type
+    benchmark<lc_wrapper<capo::spin_lock>, W, R>();
+    benchmark<lc_wrapper<std::mutex>     , W, R>();
+    benchmark<std::shared_timed_mutex    , W, R>();
 }
 
 void Unit::test_rw_lock() {