diff --git a/include/def.h b/include/def.h
index 75c673f..a8a5a47 100644
--- a/include/def.h
+++ b/include/def.h
@@ -17,6 +17,7 @@ struct uint;
 template <> struct uint<8 > { using type = std::uint8_t ; };
 template <> struct uint<16> { using type = std::uint16_t; };
 template <> struct uint<32> { using type = std::uint32_t; };
+template <> struct uint<64> { using type = std::uint64_t; }; // lets uint_t<64> resolve to std::uint64_t
 
 template <std::size_t N>
 using uint_t = typename uint<N>::type;
diff --git a/include/rw_lock.h b/include/rw_lock.h
index 3290337..28bb99d 100644
--- a/include/rw_lock.h
+++ b/include/rw_lock.h
@@ -2,12 +2,13 @@
 
 #include <atomic>
 #include <limits>
+#include <type_traits>
 
 #include "def.h"
 
 namespace ipc {
 
-class rw_lock {
+class rw_cas_lock {
     std::atomic_size_t lc_ { 0 };
 
     enum : std::size_t {
@@ -18,7 +19,7 @@ public:
     void lock() {
         for (unsigned k = 0;; ++k) {
             std::size_t expected = 0;
-            if (lc_.compare_exchange_weak(expected, w_flag, std::memory_order_acq_rel)) {
+            if (lc_.compare_exchange_weak(expected, w_flag, std::memory_order_acquire)) {
                 break;
             }
             yield(k);
@@ -34,7 +35,52 @@ public:
             std::size_t old = lc_.load(std::memory_order_relaxed);
             std::size_t unlocked = old + 1;
             if (unlocked &&
-                lc_.compare_exchange_weak(old, unlocked, std::memory_order_acq_rel)) {
+                lc_.compare_exchange_weak(old, unlocked, std::memory_order_acquire)) {
+                break;
+            }
+            yield(k);
+            std::atomic_thread_fence(std::memory_order_acquire);
+        }
+    }
+
+    void unlock_shared() {
+        lc_.fetch_sub(1, std::memory_order_release); // undo the +1 applied by lock_shared()'s CAS
+    }
+};
+
+class rw_lock {
+    using lc_ui_t = std::size_t;
+    std::atomic<lc_ui_t> lc_ { 0 };
+
+    enum : lc_ui_t {
+        w_mask = (std::numeric_limits<std::make_signed_t<lc_ui_t>>::max)(), // b 0111 1111
+        w_flag = w_mask + 1                                                 // b 1000 0000
+    };
+
+public:
+    // Exclusive lock: take w_flag, then wait until every reader has left.
+    void lock() {
+        for (unsigned k = 0;; ++k) {
+            auto old = lc_.fetch_or(w_flag, std::memory_order_acquire);
+            if (!old) return;           // no writer and no readers: lock acquired
+            if (!(old & w_flag)) break; // flag is ours now, but readers are still inside
+            yield(k);                   // another writer owns the flag: spin like a spin-lock
+        }
+        // Must run on every path that wins the flag: a reader may have slipped in between
+        // unlock() and our fetch_or. lock_shared() backs off once w_flag is set.
+        for (unsigned k = 0; lc_.load(std::memory_order_acquire) & w_mask; ++k) yield(k);
+    }
+
+    void unlock() {
+        lc_.fetch_and(w_mask, std::memory_order_release); // clears only w_flag; the low (reader-count) bits are kept
+    }
+
+    void lock_shared() {
+        for (unsigned k = 0;; ++k) {
+            auto old = lc_.load(std::memory_order_relaxed);
+            // if w_flag set, just continue; otherwise cas ++
+            if (!(old & w_flag) &&
+                lc_.compare_exchange_weak(old, old + 1, std::memory_order_acquire)) {
                 break;
             }
             yield(k);
diff --git a/test/test_circ.cpp b/test/test_circ.cpp
index b42d2a5..61cc68a 100644
--- a/test/test_circ.cpp
+++ b/test/test_circ.cpp
@@ -31,7 +31,7 @@ private slots:
     void test_prod_cons_performance();
 
     void test_queue();
-} unit__;
+} /*unit__*/;
 
 #include "test_circ.moc"
 
diff --git a/test/test_ipc.cpp b/test/test_ipc.cpp
index fe47bd7..01853e0 100644
--- a/test/test_ipc.cpp
+++ b/test/test_ipc.cpp
@@ -101,6 +101,7 @@ void benchmark() {
                 {
                     std::unique_lock<Lc> guard { lc };
                     datas.push_back(i);
+                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
                 }
                 std::this_thread::yield();
             }
@@ -122,16 +123,17 @@ void test_performance() {
               << std::endl;
 
     benchmark<ipc::rw_lock               , W, R>();
-    benchmark<lc_wrapper<capo::spin_lock>, W, R>();
-    benchmark<lc_wrapper<std::mutex>     , W, R>();
-    benchmark<std::shared_timed_mutex    , W, R>();
+//    benchmark<ipc::rw_cas_lock           , W, R>();
+//    benchmark<lc_wrapper<capo::spin_lock>, W, R>();
+//    benchmark<lc_wrapper<std::mutex>     , W, R>();
+//    benchmark<std::shared_timed_mutex    , W, R>();
 }
 
 void Unit::test_rw_lock() {
-    test_performance<1, 1>();
-    test_performance<4, 4>();
-    test_performance<1, 8>();
-    test_performance<8, 1>();
+    test_performance<2, 1>();
+//    test_performance<4, 4>();
+//    test_performance<1, 8>();
+//    test_performance<8, 1>();
 }
 
 void Unit::test_send_recv() {