* Added lambda lock primitive

* Implement MCS Combining lock

This is a hybrid of Flat Combining and the MCS queue lock. It uses
a queue like the MCS queue lock, but each item additionally
contains a thunk that performs the body of the critical section. This
enables a thread other than the one that issued the request to
perform the work.
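
For illustration, a minimal usage sketch of the new primitive (a
hypothetical counter; CombiningLock and with are the names added by this
change, and the include path is an assumption):

#include "snmalloc/ds/combininglock.h"  // assumed path to the new header
#include <cstddef>

snmalloc::CombiningLock counter_lock;
size_t counter = 0;

void increment()
{
  // The thunk may run on this thread (uncontended fast path or queue head)
  // or be executed by whichever thread currently holds the lock; either
  // way, increment() does not return until the work has been performed.
  snmalloc::with(counter_lock, [&]() { counter++; });
}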

* Add a fast path flag

This update adds a fast-path flag for the uncontended case, reducing
the number of atomic operations required when there is no contention.
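
In outline, when nobody is queued the lock is taken with a single atomic
exchange on the flag and released with a single store. A simplified,
hypothetical sketch of that check (the real logic is
CombineLockNode::attach in combininglock.h below):

#include <atomic>

// Hypothetical stand-in for the lock's two fields.
struct Lock
{
  std::atomic<bool> flag{false};     // fast-path lock flag
  std::atomic<void*> head{nullptr};  // MCS queue head
};

template<typename F>
bool try_fast_path(Lock& lock, F&& f)
{
  // Only attempt the flag when nobody is queued.
  if (lock.head.load(std::memory_order_relaxed) == nullptr &&
      !lock.flag.exchange(true, std::memory_order_acquire))
  {
    f();                                                // run the critical section inline
    lock.flag.store(false, std::memory_order_release);  // single release store to unlock
    return true;
  }
  return false;  // caller falls back to the MCS queue slow path
}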

* CR feedback
This commit is contained in:
Matthew Parkinson 2024-06-28 15:43:17 +01:00 committed by GitHub
Parent 6bd6db5f61
Commit 6af38acd94
No key matching this signature was found
GPG key ID: B5690EEEBB952194
7 changed files with 317 additions and 68 deletions

View file

@@ -96,33 +96,37 @@ namespace snmalloc
// of allocators.
SNMALLOC_SLOW_PATH static void ensure_init_slow()
{
FlagLock lock{initialisation_lock};
#ifdef SNMALLOC_TRACING
message<1024>("Run init_impl");
#endif
if (initialised)
return;
LocalEntropy entropy;
entropy.init<Pal>();
// Initialise key for remote deallocation lists
RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
with(initialisation_lock, [&]() {
#ifdef SNMALLOC_TRACING
message<1024>("Run init_impl");
#endif
// Need to randomise pagemap location. If requested and not a
// StrictProvenance architecture, randomize its table's location within a
// significantly larger address space allocation.
static constexpr bool pagemap_randomize =
mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
if (initialised)
return;
Pagemap::concretePagemap.template init<pagemap_randomize>();
LocalEntropy entropy;
entropy.init<Pal>();
// Initialise key for remote deallocation lists
RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());
if constexpr (aal_supports<StrictProvenance>)
{
Authmap::init();
}
// Need to randomise pagemap location. If requested and not a
// StrictProvenance architecture, randomize its table's location within
// a significantly larger address space allocation.
static constexpr bool pagemap_randomize =
mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
initialised.store(true, std::memory_order_release);
Pagemap::concretePagemap.template init<pagemap_randomize>();
if constexpr (aal_supports<StrictProvenance>)
{
Authmap::init();
}
initialised.store(true, std::memory_order_release);
});
}
public:
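
For readability, the shape of ensure_init_slow after this change,
reassembled from the interleaved diff above (condensed; the tracing
output is elided):

SNMALLOC_SLOW_PATH static void ensure_init_slow()
{
  with(initialisation_lock, [&]() {
    if (initialised)
      return;  // returns from the thunk only: another thread already ran init

    LocalEntropy entropy;
    entropy.init<Pal>();

    // Initialise key for remote deallocation lists
    RemoteAllocator::key_global = FreeListKey(entropy.get_free_list_key());

    // Randomise the pagemap location if requested and not a
    // StrictProvenance architecture.
    static constexpr bool pagemap_randomize =
      mitigations(random_pagemap) && !aal_supports<StrictProvenance>;
    Pagemap::concretePagemap.template init<pagemap_randomize>();

    if constexpr (aal_supports<StrictProvenance>)
      Authmap::init();

    initialised.store(true, std::memory_order_release);
  });
}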

View file

@@ -22,7 +22,7 @@ namespace snmalloc
* This is infrequently used code, a spin lock simplifies the code
* considerably, and should never be on the fast path.
*/
FlagWord spin_lock{};
CombiningLock spin_lock{};
public:
static constexpr bool Aligned = ParentRange::Aligned;
@@ -35,14 +35,18 @@ namespace snmalloc
CapPtr<void, ChunkBounds> alloc_range(size_t size)
{
FlagLock lock(spin_lock);
return parent.alloc_range(size);
CapPtr<void, ChunkBounds> result;
with(spin_lock, [&]() {
{
result = parent.alloc_range(size);
}
});
return result;
}
void dealloc_range(CapPtr<void, ChunkBounds> base, size_t size)
{
FlagLock lock(spin_lock);
parent.dealloc_range(base, size);
with(spin_lock, [&]() { parent.dealloc_range(base, size); });
}
};
};
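
The same refactoring pattern recurs throughout this change: with() takes a
void thunk, so a critical section that produces a value writes it into a
captured local instead of returning it directly, e.g. (condensed from the
hunk above):

CapPtr<void, ChunkBounds> alloc_range(size_t size)
{
  CapPtr<void, ChunkBounds> result;
  with(spin_lock, [&]() { result = parent.alloc_range(size); });
  return result;
}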

View file

@@ -0,0 +1,224 @@
#pragma once
#include "../aal/aal.h"
#include "../pal/pal.h"
#include <atomic>
#include <functional>
namespace snmalloc
{
class CombineLockNode;
struct CombiningLock
{
// Fast path lock in case there is no contention.
std::atomic<bool> flag{false};
// MCS queue of work items
std::atomic<CombineLockNode*> head{nullptr};
};
/**
* @brief Combination of the MCS queue lock with Flat Combining
*
* Each element in the queue has a pointer to a work item.
* This means that, under contention, the thread holding the lock
* can perform the work on behalf of the waiting threads.
*
* As the work items are arbitrary lambdas, there are no simplifications
* for combining related work items. E.g. the original Flat Combining
* paper might sort a collection of inserts and perform them in a single
* traversal.
*
* Note that we should perhaps add a Futex/WakeOnAddress mode to improve
* performance in the contended case, rather than spinning.
*/
class CombineLockNode
{
template<typename F>
friend class CombineLockNodeTempl;
enum class LockStatus
{
// The work for this node has not been completed.
WAITING,
// The work for this node has been completed, and it is not the
// last element in the queue.
DONE,
// The work for this node has not been completed, and it is now the
// head of the queue.
READY
};
// Status of this node in the queue. Set by the thread at the head of
// the queue when it either makes this node the new head of the queue
// or completes this node's work.
std::atomic<LockStatus> status{LockStatus::WAITING};
// Used to store the queue
std::atomic<CombineLockNode*> next{nullptr};
// Stores the C++ lambda associated with this node in the queue.
void (*f_raw)(CombineLockNode*);
void release(CombiningLock& lock)
{
lock.flag.store(false, std::memory_order_release);
}
void set_status(LockStatus s)
{
status.store(s, std::memory_order_release);
}
constexpr CombineLockNode(void (*f)(CombineLockNode*)) : f_raw(f) {}
SNMALLOC_FAST_PATH void attach(CombiningLock& lock)
{
// Test if no one is waiting
if (lock.head.load(std::memory_order_relaxed) == nullptr)
{
// No one was waiting so low contention. Attempt to acquire the flag
// lock.
if (lock.flag.exchange(true, std::memory_order_acquire) == false)
{
// We grabbed the lock.
f_raw(this);
// Release the lock
release(lock);
return;
}
}
attach_slow(lock);
}
SNMALLOC_SLOW_PATH void attach_slow(CombiningLock& lock)
{
// There is contention for the lock, we need to add our work to the
// queue of pending work
auto prev = lock.head.exchange(this, std::memory_order_acq_rel);
if (prev != nullptr)
{
// If we aren't the head, link into predecessor
prev->next.store(this, std::memory_order_release);
// Wait for the predecessor to complete
while (status.load(std::memory_order_relaxed) == LockStatus::WAITING)
Aal::pause();
// Determine if another thread completed our work.
if (status.load(std::memory_order_acquire) == LockStatus::DONE)
return;
}
else
{
// We are the head of the queue. Spin until we acquire the fast path
// lock. As we are in the queue, future requests shouldn't try to
// acquire the fast path lock, but stale views of the queue being empty
// could still be concurrent with this thread.
while (lock.flag.exchange(true, std::memory_order_acquire))
{
while (lock.flag.load(std::memory_order_relaxed))
{
Aal::pause();
}
}
// We could set
//   status = LockStatus::READY
// here; however, the subsequent code assumes it is READY, and
// nothing would read it.
}
// We are the head of the queue, and responsible for
// waking/performing our and subsequent work.
auto curr = this;
while (true)
{
// Perform work for head of the queue
curr->f_raw(curr);
// Determine if there are more elements.
auto n = curr->next.load(std::memory_order_acquire);
if (n != nullptr)
{
// Signal this work was completed and move on to
// next item.
curr->set_status(LockStatus::DONE);
curr = n;
continue;
}
// This could be the end of the queue, attempt to close the
// queue.
auto curr_c = curr;
if (lock.head.compare_exchange_strong(
curr_c,
nullptr,
std::memory_order_release,
std::memory_order_relaxed))
{
// Queue was successfully closed.
// Notify last element the work was completed.
curr->set_status(LockStatus::DONE);
release(lock);
return;
}
// Failed to close the queue; wait for the next thread to be
// added.
while (curr->next.load(std::memory_order_relaxed) == nullptr)
Aal::pause();
// As we had to wait, give the job to the next thread
// to carry on performing the work.
n = curr->next.load(std::memory_order_acquire);
n->set_status(LockStatus::READY);
// Notify the thread that we completed its work.
// Note that this needs to be done last, as we can't read
// curr->next after setting curr->status
curr->set_status(LockStatus::DONE);
return;
}
}
};
template<typename F>
class CombineLockNodeTempl : CombineLockNode
{
template<typename FF>
friend void with(CombiningLock&, FF&&);
// This holds the closure for the lambda
F f;
// Untyped version of calling f to store in the node.
static void invoke(CombineLockNode* self)
{
auto self_templ = reinterpret_cast<CombineLockNodeTempl*>(self);
self_templ->f();
}
CombineLockNodeTempl(CombiningLock& lock, F&& f_)
: CombineLockNode(invoke), f(f_)
{
attach(lock);
}
};
/**
* Lock primitive. This takes a reference to a CombiningLock and a thunk
* to call once the lock is available. The thunk should not depend on the
* identity of the current thread, as it may be executed by a different thread.
*/
template<typename F>
inline void with(CombiningLock& lock, F&& f)
{
CombineLockNodeTempl<F> node{lock, std::forward<F>(f)};
}
} // namespace snmalloc
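
A usage sketch of with() under contention (a hypothetical test, not part
of this change). The thunk may be executed by whichever thread ends up at
the head of the queue, but with() does not return until the thunk has run,
so capturing locals by reference is safe; the thunk must simply not rely
on thread identity (e.g. thread_local state or thread IDs).

#include <cstddef>
#include <thread>
#include <vector>
// assumes the combininglock.h header above is on the include path

snmalloc::CombiningLock lock;
size_t total = 0;

int main()
{
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; i++)
  {
    threads.emplace_back([]() {
      for (int j = 0; j < 1000; j++)
      {
        size_t contribution = 1;  // a stack local is fine: with() blocks
                                  // until this thread's thunk has run
        snmalloc::with(lock, [&]() { total += contribution; });
      }
    });
  }
  for (auto& t : threads)
    t.join();
  // total == 4000 once all threads have joined
}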

View file

@@ -6,6 +6,7 @@
#include "../pal/pal.h"
#include "aba.h"
#include "allocconfig.h"
#include "combininglock.h"
#include "entropy.h"
#include "flaglock.h"
#include "mpmcstack.h"

View file

@@ -133,4 +133,11 @@ namespace snmalloc
lock.flag.store(false, std::memory_order_release);
}
};
template<typename F>
inline void with(FlagWord& lock, F&& f)
{
FlagLock l(lock);
f();
}
} // namespace snmalloc
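
Presumably the FlagWord overload is provided so that call sites can use the
same with(...) syntax whichever lock type guards them; a hypothetical
illustration (GuardedCounter is not part of this change):

#include <cstddef>
// assumes flaglock.h and combininglock.h are included

template<typename LockT>
class GuardedCounter
{
  LockT lock{};
  size_t value{0};

public:
  void add(size_t n)
  {
    // Overload resolution (via ADL) picks the FlagWord or CombiningLock
    // version of with().
    with(lock, [&]() { value += n; });
  }
};

// Both GuardedCounter<snmalloc::FlagWord> and
// GuardedCounter<snmalloc::CombiningLock> compile against the same body.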

View file

@@ -35,14 +35,15 @@ namespace snmalloc
if (SNMALLOC_UNLIKELY(!initialised.load(std::memory_order_acquire)))
{
FlagLock lock(flag);
if (!initialised)
{
init(&obj);
initialised.store(true, std::memory_order_release);
if (first != nullptr)
*first = true;
}
with(flag, [&]() {
if (!initialised)
{
init(&obj);
initialised.store(true, std::memory_order_release);
if (first != nullptr)
*first = true;
}
});
}
return obj;
}
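
The change above preserves the double-checked pattern: an acquire load on
the fast path, and a re-check inside the critical section so only one
thread runs init. A stripped-down, hypothetical restatement (the release
store that publishes initialisation pairs with the acquire load, so readers
that skip the lock still see the initialised data):

#include <atomic>
// assumes flaglock.h (with the with() overload above) is included

std::atomic<bool> ready{false};
snmalloc::FlagWord init_lock{};  // lock type assumed for this sketch
int config_value;

int get_config()
{
  if (!ready.load(std::memory_order_acquire))
  {
    snmalloc::with(init_lock, [&]() {
      if (!ready)  // re-check: another thread may have initialised already
      {
        config_value = 42;  // placeholder for the real initialisation
        ready.store(true, std::memory_order_release);
      }
    });
  }
  return config_value;
}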

View file

@@ -100,8 +100,9 @@ namespace snmalloc
static T* acquire()
{
PoolState<T>& pool = get_state();
{
FlagLock f(pool.lock);
T* result{nullptr};
with(pool.lock, [&]() {
if (pool.front != nullptr)
{
auto p = pool.front;
@@ -112,17 +113,21 @@
}
pool.front = next;
p->set_in_use();
return p.unsafe_ptr();
result = p.unsafe_ptr();
}
}
});
if (result != nullptr)
return result;
auto p = ConstructT::make();
FlagLock f(pool.lock);
p->list_next = pool.list;
pool.list = p;
with(pool.lock, [&]() {
p->list_next = pool.list;
pool.list = p;
p->set_in_use();
p->set_in_use();
});
return p.unsafe_ptr();
}
@@ -146,11 +151,13 @@ namespace snmalloc
// Returns a linked list of all objects in the stack, emptying the stack.
if (p == nullptr)
{
FlagLock f(pool.lock);
auto result = pool.front;
pool.front = nullptr;
pool.back = nullptr;
return result.unsafe_ptr();
T* result;
with(pool.lock, [&]() {
result = pool.front.unsafe_ptr();
pool.front = nullptr;
pool.back = nullptr;
});
return result;
}
return p->next.unsafe_ptr();
@@ -165,18 +172,18 @@
{
PoolState<T>& pool = get_state();
last->next = nullptr;
FlagLock f(pool.lock);
with(pool.lock, [&]() {
if (pool.front == nullptr)
{
pool.front = capptr::Alloc<T>::unsafe_from(first);
}
else
{
pool.back->next = capptr::Alloc<T>::unsafe_from(first);
}
if (pool.front == nullptr)
{
pool.front = capptr::Alloc<T>::unsafe_from(first);
}
else
{
pool.back->next = capptr::Alloc<T>::unsafe_from(first);
}
pool.back = capptr::Alloc<T>::unsafe_from(last);
pool.back = capptr::Alloc<T>::unsafe_from(last);
});
}
/**
@@ -188,18 +195,19 @@
{
PoolState<T>& pool = get_state();
last->next = nullptr;
FlagLock f(pool.lock);
if (pool.front == nullptr)
{
pool.back = capptr::Alloc<T>::unsafe_from(last);
}
else
{
last->next = pool.front;
pool.back->next = capptr::Alloc<T>::unsafe_from(first);
}
pool.front = capptr::Alloc<T>::unsafe_from(first);
with(pool.lock, [&]() {
if (pool.front == nullptr)
{
pool.back = capptr::Alloc<T>::unsafe_from(last);
}
else
{
last->next = pool.front;
pool.back->next = capptr::Alloc<T>::unsafe_from(first);
}
pool.front = capptr::Alloc<T>::unsafe_from(first);
});
}
static T* iterate(T* p = nullptr)