diff --git a/.github/workflows/clangarm64-build.yml b/.github/workflows/clangarm64-build.yml
new file mode 100644
index 0000000000..a826dc5b31
--- /dev/null
+++ b/.github/workflows/clangarm64-build.yml
@@ -0,0 +1,25 @@
+name: CLANG build ARM64
+
+on:
+  workflow_dispatch:
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  clang-build:
+    runs-on: [Windows, ARM64]
+    env:
+      NO_PERL: 1
+    steps:
+      - uses: actions/checkout@v3
+      - uses: git-for-windows/setup-git-for-windows-sdk@v1
+        with:
+          flavor: makepkg-git
+          architecture: aarch64
+          # This assumes that the job is running on a self-hosted runner,
+          # in which case we need to cleanup SDK files.
+          cleanup: true
+      - name: Build Git CLANGARM64
+        run: make -j`nproc`
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9fdbd54028..749704bf00 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -169,8 +169,11 @@ jobs:
       NO_PERL: 1
       GIT_CONFIG_PARAMETERS: "'user.name=CI' 'user.email=ci@git'"
     runs-on: windows-latest
+    strategy:
+      matrix:
+        arch: [x64, arm64]
     concurrency:
-      group: vs-build-${{ github.ref }}
+      group: vs-build-${{ github.ref }}-${{ matrix.arch }}
       cancel-in-progress: ${{ needs.ci-config.outputs.skip_concurrent == 'yes' }}
     steps:
     - uses: actions/checkout@v3
@@ -181,26 +184,22 @@ jobs:
         repository: 'microsoft/vcpkg'
         path: 'compat/vcbuild/vcpkg'
     - name: download vcpkg artifacts
-      shell: powershell
-      run: |
-        $urlbase = "https://dev.azure.com/git/git/_apis/build/builds"
-        $id = ((Invoke-WebRequest -UseBasicParsing "${urlbase}?definitions=9&statusFilter=completed&resultFilter=succeeded&`$top=1").content | ConvertFrom-JSON).value[0].id
-        $downloadUrl = ((Invoke-WebRequest -UseBasicParsing "${urlbase}/$id/artifacts").content | ConvertFrom-JSON).value[0].resource.downloadUrl
-        (New-Object Net.WebClient).DownloadFile($downloadUrl, "compat.zip")
-        Expand-Archive compat.zip -DestinationPath . -Force
-        Remove-Item compat.zip
+      uses: git-for-windows/get-azure-pipelines-artifact@v0
+      with:
+        repository: git/git
+        definitionId: 9
     - name: add msbuild to PATH
       uses: microsoft/setup-msbuild@v1
     - name: copy dlls to root
       shell: cmd
-      run: compat\vcbuild\vcpkg_copy_dlls.bat release
+      run: compat\vcbuild\vcpkg_copy_dlls.bat release ${{ matrix.arch }}-windows
     - name: generate Visual Studio solution
       shell: bash
       run: |
-        cmake `pwd`/contrib/buildsystems/ -DCMAKE_PREFIX_PATH=`pwd`/compat/vcbuild/vcpkg/installed/x64-windows \
-        -DNO_GETTEXT=YesPlease -DPERL_TESTS=OFF -DPYTHON_TESTS=OFF -DCURL_NO_CURL_CMAKE=ON
+        cmake `pwd`/contrib/buildsystems/ -DCMAKE_PREFIX_PATH=`pwd`/compat/vcbuild/vcpkg/installed/${{ matrix.arch }}-windows \
+        -DNO_GETTEXT=YesPlease -DPERL_TESTS=OFF -DPYTHON_TESTS=OFF -DCURL_NO_CURL_CMAKE=ON -DCMAKE_GENERATOR_PLATFORM=${{ matrix.arch }} -DVCPKG_ARCH=${{ matrix.arch }}-windows -DHOST_CPU=${{ matrix.arch }}
     - name: MSBuild
-      run: msbuild git.sln -property:Configuration=Release -property:Platform=x64 -maxCpuCount:4 -property:PlatformToolset=v142
+      run: msbuild git.sln -property:Configuration=Release -property:Platform=${{ matrix.arch }} -maxCpuCount:4 -property:PlatformToolset=v142
     - name: bundle artifact tar
       shell: bash
       env:
@@ -214,7 +213,7 @@ jobs:
     - name: upload tracked files and build artifacts
       uses: actions/upload-artifact@v3
       with:
-        name: vs-artifacts
+        name: vs-artifacts-${{ matrix.arch }}
         path: artifacts
   vs-test:
     name: win+VS test
@@ -232,7 +231,7 @@ jobs:
     - name: download tracked files and build artifacts
       uses: actions/download-artifact@v3
       with:
-        name: vs-artifacts
+        name: vs-artifacts-x64
         path: ${{github.workspace}}
     - name: extract tracked files and build artifacts
       shell: bash
diff --git a/.github/workflows/nano-server.yml b/.github/workflows/nano-server.yml
new file mode 100644
index 0000000000..580725c817
--- /dev/null
+++ b/.github/workflows/nano-server.yml
@@ -0,0 +1,76 @@
+name: Windows Nano Server tests
+
+on:
+  workflow_dispatch:
+
+env:
+  DEVELOPER: 1
+
+jobs:
+  test-nano-server:
+    runs-on: windows-2022
+    env:
+      WINDBG_DIR: "C:/Program Files (x86)/Windows Kits/10/Debuggers/x64"
+      IMAGE: mcr.microsoft.com/powershell:nanoserver-ltsc2022
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: git-for-windows/setup-git-for-windows-sdk@v1
+      - name: build Git
+        shell: bash
+        run: make -j15
+      - name: pull nanoserver image
+        shell: bash
+        run: docker pull $IMAGE
+      - name: run nano-server test
+        shell: bash
+        run: |
+          docker run \
+            --user "ContainerAdministrator" \
+            -v "$WINDBG_DIR:C:/dbg" \
+            -v "$(cygpath -aw /mingw64/bin):C:/mingw64-bin" \
+            -v "$(cygpath -aw .):C:/test" \
+            $IMAGE pwsh.exe -Command '
+              # Extend the PATH to include the `.dll` files in /mingw64/bin/
+              $env:PATH += ";C:\mingw64-bin"
+
+              # For each executable to test pick some no-operation set of
+              # flags/subcommands or something that should quickly result in an
+              # error with known exit code that is not a negative 32-bit
+              # number, and set the expected return code appropriately.
+              #
+              # Only test executables that could be expected to run in a UI
+              # less environment.
+              #
+              # ( Executable path, arguments, expected return code )
+              # also note space is required before close parenthesis (a
+              # powershell quirk when defining nested arrays like this)
+
+              $executables_to_test = @(
+                  ("C:\test\git.exe", "", 1 ),
+                  ("C:\test\scalar.exe", "version", 0 )
+              )
+
+              foreach ($executable in $executables_to_test)
+              {
+                  Write-Output "Now testing $($executable[0])"
+                  &$executable[0] $executable[1]
+                  if ($LASTEXITCODE -ne $executable[2]) {
+                      # if we failed, run the debugger to find out what function
+                      # or DLL could not be found and then exit the script with
+                      # failure The missing DLL or EXE will be referenced near
+                      # the end of the output
+
+                      # Set a flag to have the debugger show loader stub
+                      # diagnostics. This requires running as administrator,
+                      # otherwise the flag will be ignored.
+                      C:\dbg\gflags -i $executable[0] +SLS
+
+                      C:\dbg\cdb.exe -c "g" -c "q" $executable[0] $executable[1]
+
+                      exit 1
+                  }
+              }
+
+              exit 0
+            '
diff --git a/.gitignore b/.gitignore
index 5e56e471b3..53066da66b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -246,3 +246,4 @@ Release/
 /git.VC.db
 *.dSYM
 /contrib/buildsystems/out
+CMakeSettings.json
diff --git a/Documentation/config.txt b/Documentation/config.txt
index e3a74dd1c1..07fabccdf3 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -509,6 +509,8 @@ include::config/safe.txt[]
 
 include::config/sendemail.txt[]
 
+include::config/sendpack.txt[]
+
 include::config/sequencer.txt[]
 
 include::config/showbranch.txt[]
@@ -545,4 +547,6 @@ include::config/versionsort.txt[]
 
 include::config/web.txt[]
 
+include::config/windows.txt[]
+
 include::config/worktree.txt[]
diff --git a/Documentation/config/http.txt b/Documentation/config/http.txt
index 2d4e0c9b86..fe4acf561b 100644
--- a/Documentation/config/http.txt
+++ b/Documentation/config/http.txt
@@ -189,11 +189,13 @@ http.sslBackend::
 
 http.schannelCheckRevoke::
 	Used to enforce or disable certificate revocation checks in cURL
-	when http.sslBackend is set to "schannel". Defaults to `true` if
-	unset. Only necessary to disable this if Git consistently errors
-	and the message is about checking the revocation status of a
-	certificate. This option is ignored if cURL lacks support for
-	setting the relevant SSL option at runtime.
+	when http.sslBackend is set to "schannel" via "true" and "false",
+	respectively. Another accepted value is "best-effort" (the default)
+	in which case revocation checks are performed, but errors due to
+	revocation list distribution points that are offline are silently
+	ignored, as well as errors due to certificates missing revocation
+	list distribution points. This option is ignored if cURL lacks
+	support for setting the relevant SSL option at runtime.
 
 http.schannelUseSSLCAInfo::
 	As of cURL v7.60.0, the Secure Channel backend can use the
@@ -203,6 +205,11 @@ http.schannelUseSSLCAInfo::
 	when the `schannel` backend was configured via `http.sslBackend`,
 	unless `http.schannelUseSSLCAInfo` overrides this behavior.
 
+http.sslAutoClientCert::
+	As of cURL v7.77.0, the Secure Channel backend won't automatically
+	send client certificates from the Windows Certificate Store anymore.
+	To opt in to the old behavior, http.sslAutoClientCert can be set.
+
 http.pinnedPubkey::
 	Public key of the https service. It may either be the filename of
 	a PEM or DER encoded public key file or a string starting with
diff --git a/Documentation/config/sendpack.txt b/Documentation/config/sendpack.txt
new file mode 100644
index 0000000000..e306f657fb
--- /dev/null
+++ b/Documentation/config/sendpack.txt
@@ -0,0 +1,5 @@
+sendpack.sideband::
+	Allows to disable the side-band-64k capability for send-pack even
+	when it is advertised by the server. Makes it possible to work
+	around a limitation in the git for windows implementation together
+	with the dump git protocol. Defaults to true.
diff --git a/Documentation/config/windows.txt b/Documentation/config/windows.txt
new file mode 100644
index 0000000000..fdaaf1c655
--- /dev/null
+++ b/Documentation/config/windows.txt
@@ -0,0 +1,4 @@
+windows.appendAtomically::
+	By default, append atomic API is used on windows. But it works only with
+	local disk files, if you're working on a network file system, you should
+	set it false to turn it off.
diff --git a/Makefile b/Makefile
index 03adcb5a48..4bbd777168 100644
--- a/Makefile
+++ b/Makefile
@@ -464,6 +464,11 @@ include shared.mak
 #
 #     CURL_LDFLAGS=-lcurl
 #
+# Define LAZYLOAD_LIBCURL to dynamically load the libcurl; This can be useful
+# if Multiple libcurl versions exist (with different file names) that link to
+# various SSL/TLS backends, to support the `http.sslBackend` runtime switch in
+# such a scenario.
+#
 # === Optional library: libpcre2 ===
 #
 # Define USE_LIBPCRE if you have and want to use libpcre. Various
@@ -1328,6 +1333,7 @@ BUILTIN_OBJS += builtin/write-tree.o
 # upstream unnecessarily (making merging in future changes easier).
 THIRD_PARTY_SOURCES += compat/inet_ntop.c
 THIRD_PARTY_SOURCES += compat/inet_pton.c
+THIRD_PARTY_SOURCES += compat/mimalloc/%
 THIRD_PARTY_SOURCES += compat/nedmalloc/%
 THIRD_PARTY_SOURCES += compat/obstack.%
 THIRD_PARTY_SOURCES += compat/poll/%
@@ -1601,10 +1607,23 @@ else
 		CURL_LIBCURL =
 	endif
 
-	ifndef CURL_LDFLAGS
-		CURL_LDFLAGS = $(eval CURL_LDFLAGS := $$(shell $$(CURL_CONFIG) --libs))$(CURL_LDFLAGS)
+	ifdef LAZYLOAD_LIBCURL
+		LAZYLOAD_LIBCURL_OBJ = compat/lazyload-curl.o
+		OBJECTS += $(LAZYLOAD_LIBCURL_OBJ)
+		# The `CURL_STATICLIB` constant must be defined to avoid seeing the functions
+		# declared as DLL imports
+		CURL_CFLAGS = -DCURL_STATICLIB
+ifneq ($(uname_S),MINGW)
+ifneq ($(uname_S),Windows)
+		CURL_LIBCURL = -ldl
+endif
+endif
+	else
+		ifndef CURL_LDFLAGS
+			CURL_LDFLAGS = $(eval CURL_LDFLAGS := $$(shell $$(CURL_CONFIG) --libs))$(CURL_LDFLAGS)
+		endif
+		CURL_LIBCURL += $(CURL_LDFLAGS)
 	endif
-	CURL_LIBCURL += $(CURL_LDFLAGS)
 
 	ifndef CURL_CFLAGS
 		CURL_CFLAGS = $(eval CURL_CFLAGS := $$(shell $$(CURL_CONFIG) --cflags))$(CURL_CFLAGS)
@@ -1625,7 +1644,7 @@ else
 	endif
 	ifdef USE_CURL_FOR_IMAP_SEND
 		BASIC_CFLAGS += -DUSE_CURL_FOR_IMAP_SEND
-		IMAP_SEND_BUILDDEPS = http.o
+		IMAP_SEND_BUILDDEPS = http.o $(LAZYLOAD_LIBCURL_OBJ)
 		IMAP_SEND_LDFLAGS += $(CURL_LIBCURL)
 	endif
 	ifndef NO_EXPAT
@@ -2056,6 +2075,41 @@ ifdef USE_NED_ALLOCATOR
 	OVERRIDE_STRDUP = YesPlease
 endif
 
+ifdef USE_MIMALLOC
+	MIMALLOC_OBJS = \
+		compat/mimalloc/alloc-aligned.o \
+		compat/mimalloc/alloc.o \
+		compat/mimalloc/arena.o \
+		compat/mimalloc/bitmap.o \
+		compat/mimalloc/heap.o \
+		compat/mimalloc/init.o \
+		compat/mimalloc/options.o \
+		compat/mimalloc/os.o \
+		compat/mimalloc/page.o \
+		compat/mimalloc/random.o \
+		compat/mimalloc/segment.o \
+		compat/mimalloc/segment-cache.o \
+		compat/mimalloc/stats.o
+
+	COMPAT_CFLAGS += -Icompat/mimalloc -DMI_DEBUG=0 -DUSE_MIMALLOC --std=gnu11
+	COMPAT_OBJS += $(MIMALLOC_OBJS)
+
+$(MIMALLOC_OBJS): COMPAT_CFLAGS += -DBANNED_H
+
+$(MIMALLOC_OBJS): COMPAT_CFLAGS += \
+	-Wno-attributes \
+	-Wno-unknown-pragmas \
+	-Wno-array-bounds
+
+ifdef DEVELOPER
+$(MIMALLOC_OBJS): COMPAT_CFLAGS += \
+	-Wno-pedantic \
+	-Wno-declaration-after-statement \
+	-Wno-old-style-definition \
+	-Wno-missing-prototypes
+endif
+endif
+
 ifdef OVERRIDE_STRDUP
 	COMPAT_CFLAGS += -DOVERRIDE_STRDUP
 	COMPAT_OBJS += compat/strdup.o
@@ -2796,10 +2850,10 @@ git-imap-send$X: imap-send.o $(IMAP_SEND_BUILDDEPS) GIT-LDFLAGS $(GITLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
 		$(IMAP_SEND_LDFLAGS) $(LIBS)
 
-git-http-fetch$X: http.o http-walker.o http-fetch.o GIT-LDFLAGS $(GITLIBS)
+git-http-fetch$X: http.o http-walker.o http-fetch.o $(LAZYLOAD_LIBCURL_OBJ) GIT-LDFLAGS $(GITLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
 		$(CURL_LIBCURL) $(LIBS)
-git-http-push$X: http.o http-push.o GIT-LDFLAGS $(GITLIBS)
+git-http-push$X: http.o http-push.o $(LAZYLOAD_LIBCURL_OBJ) GIT-LDFLAGS $(GITLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
 		$(CURL_LIBCURL) $(EXPAT_LIBEXPAT) $(LIBS)
 
@@ -2809,7 +2863,7 @@ $(REMOTE_CURL_ALIASES): $(REMOTE_CURL_PRIMARY)
 	ln -s $< $@ 2>/dev/null || \
 	cp $< $@
 
-$(REMOTE_CURL_PRIMARY): remote-curl.o http.o http-walker.o GIT-LDFLAGS $(GITLIBS)
+$(REMOTE_CURL_PRIMARY): remote-curl.o http.o http-walker.o $(LAZYLOAD_LIBCURL_OBJ) GIT-LDFLAGS $(GITLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
 		$(CURL_LIBCURL) $(EXPAT_LIBEXPAT) $(LIBS)
 
@@ -3699,12 +3753,15 @@ ifdef MSVC
 	$(RM) $(patsubst %.o,%.o.pdb,$(OBJECTS))
 	$(RM) headless-git.o.pdb
 	$(RM) $(patsubst %.exe,%.pdb,$(OTHER_PROGRAMS))
+	$(RM) $(patsubst %.exe,%.ilk,$(OTHER_PROGRAMS))
 	$(RM) $(patsubst %.exe,%.iobj,$(OTHER_PROGRAMS))
 	$(RM) $(patsubst %.exe,%.ipdb,$(OTHER_PROGRAMS))
 	$(RM) $(patsubst %.exe,%.pdb,$(PROGRAMS))
+	$(RM) $(patsubst %.exe,%.ilk,$(PROGRAMS))
 	$(RM) $(patsubst %.exe,%.iobj,$(PROGRAMS))
 	$(RM) $(patsubst %.exe,%.ipdb,$(PROGRAMS))
 	$(RM) $(patsubst %.exe,%.pdb,$(TEST_PROGRAMS))
+	$(RM) $(patsubst %.exe,%.ilk,$(TEST_PROGRAMS))
 	$(RM) $(patsubst %.exe,%.iobj,$(TEST_PROGRAMS))
 	$(RM) $(patsubst %.exe,%.ipdb,$(TEST_PROGRAMS))
 	$(RM) compat/vcbuild/MSVC-DEFS-GEN
diff --git a/abspath.c b/abspath.c
index 1202cde23d..0c17e98654 100644
--- a/abspath.c
+++ b/abspath.c
@@ -93,6 +93,9 @@ static char *strbuf_realpath_1(struct strbuf *resolved, const char *path,
 			goto error_out;
 	}
 
+	if (platform_strbuf_realpath(resolved, path))
+		return resolved->buf;
+
 	strbuf_addstr(&remaining, path);
 	get_root_part(resolved, &remaining);
 
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
new file mode 100644
index 0000000000..fed26341d4
--- /dev/null
+++ b/azure-pipelines.yml
@@ -0,0 +1,393 @@
+variables:
+  Agent.Source.Git.ShallowFetchDepth: 1
+  GIT_CONFIG_PARAMETERS: "'checkout.workers=56' 'user.name=CI' 'user.email=ci@git'"
+
+jobs:
+- job: windows_build
+  displayName: Windows Build
+  condition: succeeded()
+  pool:
+    vmImage: windows-latest
+  timeoutInMinutes: 240
+  steps:
+  - bash: git clone --bare --depth=1 --filter=blob:none --single-branch -b main https://github.com/git-for-windows/git-sdk-64
+    displayName: 'clone git-sdk-64'
+  - bash: git clone --depth=1 --single-branch -b main https://github.com/git-for-windows/build-extra
+    displayName: 'clone build-extra'
+  - bash: sh -x ./build-extra/please.sh create-sdk-artifact --sdk=git-sdk-64.git --out=git-sdk-64-minimal minimal-sdk
+    displayName: 'build git-sdk-64-minimal-sdk'
+  - bash: |
+      # Let Git ignore the SDK and the test-cache
+      printf "%s\n" /git-sdk-64.git/ /build-extra/ /git-sdk-64-minimal/ /test-cache/ >>'.git/info/exclude'
+    displayName: 'Ignore untracked directories'
+  - bash: ci/make-test-artifacts.sh artifacts
+    displayName: Build
+    env:
+      HOME: $(Build.SourcesDirectory)
+      MSYSTEM: MINGW64
+      DEVELOPER: 1
+      NO_PERL: 1
+      PATH: "$(Build.SourcesDirectory)\\git-sdk-64-minimal\\mingw64\\bin;$(Build.SourcesDirectory)\\git-sdk-64-minimal\\usr\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\system32\\wbem"
+  - task: PublishPipelineArtifact@0
+    displayName: 'Publish Pipeline Artifact: test artifacts'
+    inputs:
+      artifactName: 'windows-artifacts'
+      targetPath: '$(Build.SourcesDirectory)\artifacts'
+  - task: PublishPipelineArtifact@0
+    displayName: 'Publish Pipeline Artifact: git-sdk-64-minimal'
+    inputs:
+      artifactName: 'git-sdk-64-minimal'
+      targetPath: '$(Build.SourcesDirectory)\git-sdk-64-minimal'
+
+- job: windows_test
+  displayName: Windows Test
+  dependsOn: windows_build
+  condition: succeeded()
+  pool:
+    vmImage: windows-latest
+  timeoutInMinutes: 240
+  strategy:
+    parallel: 10
+  steps:
+  - task: DownloadPipelineArtifact@0
+    displayName: 'Download Pipeline Artifact: test artifacts'
+    inputs:
+      artifactName: 'windows-artifacts'
+      targetPath: '$(Build.SourcesDirectory)'
+  - task: DownloadPipelineArtifact@0
+    displayName: 'Download Pipeline Artifact: git-sdk-64-minimal'
+    inputs:
+      artifactName: 'git-sdk-64-minimal'
+      targetPath: '$(Build.SourcesDirectory)\git-sdk-64-minimal'
+  - bash: |
+      test -f artifacts.tar.gz || {
+        echo No test artifacts found\; skipping >&2
+        exit 0
+      }
+      tar xf artifacts.tar.gz || exit 1
+
+      # Let Git ignore the SDK and the test-cache
+      printf '%s\n' /git-sdk-64.git/ /build-extra/ /git-sdk-64-minimal/ /test-cache/ >>.git/info/exclude
+
+      ci/run-test-slice.sh $SYSTEM_JOBPOSITIONINPHASE $SYSTEM_TOTALJOBSINPHASE || {
+        ci/print-test-failures.sh
+        exit 1
+      }
+    displayName: 'Test (parallel)'
+    env:
+      HOME: $(Build.SourcesDirectory)
+      MSYSTEM: MINGW64
+      NO_SVN_TESTS: 1
+      GIT_TEST_SKIP_REBASE_P: 1
+      PATH: "$(Build.SourcesDirectory)\\git-sdk-64-minimal\\mingw64\\bin;$(Build.SourcesDirectory)\\git-sdk-64-minimal\\usr\\bin\\core_perl;$(Build.SourcesDirectory)\\git-sdk-64-minimal\\usr\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\system32\\wbem"
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'windows'
+      platform: Windows
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: vs_build
+  displayName: Visual Studio Build
+  condition: succeeded()
+  pool:
+    vmImage: windows-latest
+  timeoutInMinutes: 240
+  steps:
+  - bash: git clone --bare --depth=1 --filter=blob:none --single-branch -b main https://github.com/git-for-windows/git-sdk-64
+    displayName: 'clone git-sdk-64'
+  - bash: git clone --depth=1 --single-branch -b main https://github.com/git-for-windows/build-extra
+    displayName: 'clone build-extra'
+  - bash: sh -x ./build-extra/please.sh create-sdk-artifact --sdk=git-sdk-64.git --out=git-sdk-64-minimal minimal-sdk
+    displayName: 'build git-sdk-64-minimal-sdk'
+  - bash: |
+      # Let Git ignore the SDK and the test-cache
+      printf "%s\n" /git-sdk-64-minimal/ /test-cache/ >>'.git/info/exclude'
+    displayName: 'Ignore untracked directories'
+  - bash: make NDEBUG=1 DEVELOPER=1 vcxproj
+    displayName: Generate Visual Studio Solution
+    env:
+      HOME: $(Build.SourcesDirectory)
+      MSYSTEM: MINGW64
+      DEVELOPER: 1
+      NO_PERL: 1
+      PATH: "$(Build.SourcesDirectory)\\git-sdk-64-minimal\\mingw64\\bin;$(Build.SourcesDirectory)\\git-sdk-64-minimal\\usr\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\system32\\wbem"
+  - powershell: |
+      $urlbase = "https://dev.azure.com/git/git/_apis/build/builds"
+      $id = ((Invoke-WebRequest -UseBasicParsing "${urlbase}?definitions=9&statusFilter=completed&resultFilter=succeeded&`$top=1").content | ConvertFrom-JSON).value[0].id
+      $downloadUrl = ((Invoke-WebRequest -UseBasicParsing "${urlbase}/$id/artifacts").content | ConvertFrom-JSON).value[0].resource.downloadUrl
+      (New-Object Net.WebClient).DownloadFile($downloadUrl, "compat.zip")
+      Expand-Archive compat.zip -DestinationPath . -Force
+      Remove-Item compat.zip
+    displayName: 'Download vcpkg artifacts'
+  - task: MSBuild@1
+    inputs:
+      solution: git.sln
+      platform: x64
+      configuration: Release
+      maximumCpuCount: 4
+      msbuildArguments: /p:PlatformToolset=v142
+  - bash: |
+      ./compat/vcbuild/vcpkg_copy_dlls.bat release &&
+      mkdir -p artifacts &&
+      eval "$(make -n artifacts-tar INCLUDE_DLLS_IN_ARTIFACTS=YesPlease ARTIFACTS_DIRECTORY=artifacts | grep ^tar)"
+    displayName: Bundle artifact tar
+    env:
+      HOME: $(Build.SourcesDirectory)
+      MSYSTEM: MINGW64
+      DEVELOPER: 1
+      NO_PERL: 1
+      MSVC: 1
+      VCPKG_ROOT: $(Build.SourcesDirectory)\compat\vcbuild\vcpkg
+      PATH: "$(Build.SourcesDirectory)\\git-sdk-64-minimal\\mingw64\\bin;$(Build.SourcesDirectory)\\git-sdk-64-minimal\\usr\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\system32\\wbem"
+  - powershell: |
+      $tag = (Invoke-WebRequest -UseBasicParsing "https://gitforwindows.org/latest-tag.txt").content
+      $version = (Invoke-WebRequest -UseBasicParsing "https://gitforwindows.org/latest-version.txt").content
+      $url = "https://github.com/git-for-windows/git/releases/download/${tag}/PortableGit-${version}-64-bit.7z.exe"
+      (New-Object Net.WebClient).DownloadFile($url,"PortableGit.exe")
+      & .\PortableGit.exe -y -oartifacts\PortableGit
+      # Wait until it is unpacked
+      while (-not @(Remove-Item -ErrorAction SilentlyContinue PortableGit.exe; $?)) { sleep 1 }
+    displayName: Download & extract portable Git
+  - task: PublishPipelineArtifact@0
+    displayName: 'Publish Pipeline Artifact: MSVC test artifacts'
+    inputs:
+      artifactName: 'vs-artifacts'
+      targetPath: '$(Build.SourcesDirectory)\artifacts'
+
+- job: vs_test
+  displayName: Visual Studio Test
+  dependsOn: vs_build
+  condition: succeeded()
+  pool:
+    vmImage: windows-latest
+  timeoutInMinutes: 240
+  strategy:
+    parallel: 10
+  steps:
+  - task: DownloadPipelineArtifact@0
+    displayName: 'Download Pipeline Artifact: VS test artifacts'
+    inputs:
+      artifactName: 'vs-artifacts'
+      targetPath: '$(Build.SourcesDirectory)'
+  - bash: |
+      test -f artifacts.tar.gz || {
+        echo No test artifacts found\; skipping >&2
+        exit 0
+      }
+      tar xf artifacts.tar.gz || exit 1
+
+      # Let Git ignore the SDK and the test-cache
+      printf '%s\n' /PortableGit/ /test-cache/ >>.git/info/exclude
+
+      cd t &&
+      PATH="$PWD/helper:$PATH" &&
+      test-tool.exe run-command testsuite --jobs=10 -V -x --write-junit-xml \
+              $(test-tool.exe path-utils slice-tests \
+                      $SYSTEM_JOBPOSITIONINPHASE $SYSTEM_TOTALJOBSINPHASE t[0-9]*.sh)
+    displayName: 'Test (parallel)'
+    env:
+      HOME: $(Build.SourcesDirectory)
+      MSYSTEM: MINGW64
+      NO_SVN_TESTS: 1
+      GIT_TEST_SKIP_REBASE_P: 1
+      PATH: "$(Build.SourcesDirectory)\\PortableGit\\mingw64\\bin;$(Build.SourcesDirectory)\\PortableGit\\usr\\bin;C:\\Windows\\system32;C:\\Windows;C:\\Windows\\system32\\wbem"
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'vs'
+      platform: Windows
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-vs-test-artifacts
+
+- job: linux_clang
+  displayName: linux-clang
+  condition: succeeded()
+  pool:
+    vmImage: ubuntu-latest
+  steps:
+  - bash: |
+       export CC=clang || exit 1
+
+       ci/install-dependencies.sh || exit 1
+       ci/run-build-and-tests.sh || {
+           ci/print-test-failures.sh
+           exit 1
+       }
+    displayName: 'ci/run-build-and-tests.sh'
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'linux-clang'
+      platform: Linux
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: linux_gcc
+  displayName: linux-gcc
+  condition: succeeded()
+  pool:
+    vmImage: ubuntu-latest
+  steps:
+  - bash: |
+       ci/install-dependencies.sh || exit 1
+       ci/run-build-and-tests.sh || {
+           ci/print-test-failures.sh
+           exit 1
+       }
+    displayName: 'ci/run-build-and-tests.sh'
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'linux-gcc'
+      platform: Linux
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: osx_clang
+  displayName: osx-clang
+  condition: succeeded()
+  pool:
+    vmImage: macOS-latest
+  steps:
+  - bash: |
+       export CC=clang
+
+       ci/install-dependencies.sh || exit 1
+       ci/run-build-and-tests.sh || {
+           ci/print-test-failures.sh
+           exit 1
+       }
+    displayName: 'ci/run-build-and-tests.sh'
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'osx-clang'
+      platform: macOS
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: osx_gcc
+  displayName: osx-gcc
+  condition: succeeded()
+  pool:
+    vmImage: macOS-latest
+  steps:
+  - bash: |
+       ci/install-dependencies.sh || exit 1
+       ci/run-build-and-tests.sh || {
+           ci/print-test-failures.sh
+           exit 1
+       }
+    displayName: 'ci/run-build-and-tests.sh'
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'osx-gcc'
+      platform: macOS
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: linux32
+  displayName: Linux32
+  condition: succeeded()
+  pool:
+    vmImage: ubuntu-latest
+  steps:
+  - bash: |
+       res=0
+       sudo AGENT_OS="$AGENT_OS" BUILD_BUILDNUMBER="$BUILD_BUILDNUMBER" BUILD_REPOSITORY_URI="$BUILD_REPOSITORY_URI" BUILD_SOURCEBRANCH="$BUILD_SOURCEBRANCH" BUILD_SOURCEVERSION="$BUILD_SOURCEVERSION" SYSTEM_PHASENAME="$SYSTEM_PHASENAME" SYSTEM_TASKDEFINITIONSURI="$SYSTEM_TASKDEFINITIONSURI" SYSTEM_TEAMPROJECT="$SYSTEM_TEAMPROJECT" CC=$CC MAKEFLAGS="$MAKEFLAGS" jobname=linux32 bash -lxc ci/run-docker.sh || res=1
+
+       sudo chmod a+r t/out/TEST-*.xml
+       test ! -d t/failed-test-artifacts || sudo chmod a+r t/failed-test-artifacts
+
+       exit $res
+    displayName: 'jobname=linux32 ci/run-docker.sh'
+  - task: PublishTestResults@2
+    displayName: 'Publish Test Results **/TEST-*.xml'
+    inputs:
+      mergeTestResults: true
+      testRunTitle: 'linux32'
+      platform: Linux
+      publishRunAttachments: false
+    condition: succeededOrFailed()
+  - task: PublishBuildArtifacts@1
+    displayName: 'Publish trash directories of failed tests'
+    condition: failed()
+    inputs:
+      PathtoPublish: t/failed-test-artifacts
+      ArtifactName: failed-test-artifacts
+
+- job: static_analysis
+  displayName: StaticAnalysis
+  condition: succeeded()
+  pool:
+    vmImage: ubuntu-22.04
+  steps:
+  - bash: |
+       sudo apt-get update &&
+       sudo apt-get install -y coccinelle libcurl4-openssl-dev libssl-dev libexpat-dev gettext &&
+
+       export jobname=StaticAnalysis &&
+
+       ci/run-static-analysis.sh || exit 1
+    displayName: 'ci/run-static-analysis.sh'
+
+- job: documentation
+  displayName: Documentation
+  condition: succeeded()
+  pool:
+    vmImage: ubuntu-latest
+  steps:
+  - bash: |
+       sudo apt-get update &&
+       sudo apt-get install -y asciidoc xmlto asciidoctor docbook-xsl-ns &&
+
+       export ALREADY_HAVE_ASCIIDOCTOR=yes. &&
+       export jobname=Documentation &&
+
+       ci/test-documentation.sh || exit 1
+    displayName: 'ci/test-documentation.sh'
diff --git a/builtin/clean.c b/builtin/clean.c
index 49c224e626..9eedb9239b 100644
--- a/builtin/clean.c
+++ b/builtin/clean.c
@@ -39,6 +39,10 @@ static const char *msg_remove = N_("Removing %s\n");
 static const char *msg_would_remove = N_("Would remove %s\n");
 static const char *msg_skip_git_dir = N_("Skipping repository %s\n");
 static const char *msg_would_skip_git_dir = N_("Would skip repository %s\n");
+#ifndef CAN_UNLINK_MOUNT_POINTS
+static const char *msg_skip_mount_point = N_("Skipping mount point %s\n");
+static const char *msg_would_skip_mount_point = N_("Would skip mount point %s\n");
+#endif
 static const char *msg_warn_remove_failed = N_("failed to remove %s");
 static const char *msg_warn_lstat_failed = N_("could not lstat %s\n");
 static const char *msg_skip_cwd = N_("Refusing to remove current working directory\n");
@@ -183,6 +187,29 @@ static int remove_dirs(struct strbuf *path, const char *prefix, int force_flag,
 		goto out;
 	}
 
+	if (is_mount_point(path)) {
+#ifndef CAN_UNLINK_MOUNT_POINTS
+		if (!quiet) {
+			quote_path(path->buf, prefix, &quoted, 0);
+			printf(dry_run ?
+			       _(msg_would_skip_mount_point) :
+			       _(msg_skip_mount_point), quoted.buf);
+		}
+		*dir_gone = 0;
+#else
+		if (!dry_run && unlink(path->buf)) {
+			int saved_errno = errno;
+			quote_path(path->buf, prefix, &quoted, 0);
+			errno = saved_errno;
+			warning_errno(_(msg_warn_remove_failed), quoted.buf);
+			*dir_gone = 0;
+			ret = -1;
+		}
+#endif
+
+		goto out;
+	}
+
 	dir = opendir(path->buf);
 	if (!dir) {
 		/* an empty dir could be removed even if it is unreadble */
diff --git a/ci/lib.sh b/ci/lib.sh
index bc0b23099d..df8452b93e 100755
--- a/ci/lib.sh
+++ b/ci/lib.sh
@@ -161,6 +161,12 @@ then
 	MAKEFLAGS="$MAKEFLAGS --jobs=10"
 	test windows_nt != "$CI_OS_NAME" ||
 	GIT_TEST_OPTS="--no-chain-lint --no-bin-wrappers $GIT_TEST_OPTS"
+	case "$CI_OS_NAME" in
+	linux) runs_on_pool=ubuntu-latest;;
+	macos|osx) runs_on_pool=macos-latest;;
+	windows_nt) runs_on_pool=windows-latest;;
+	*) echo "Unhandled OS: $CI_OS_NAME" >&2; exit 1;;
+	esac
 elif test true = "$GITHUB_ACTIONS"
 then
 	CI_TYPE=github-actions
diff --git a/ci/run-build-and-tests.sh b/ci/run-build-and-tests.sh
index 2528f25e31..3f1e108926 100755
--- a/ci/run-build-and-tests.sh
+++ b/ci/run-build-and-tests.sh
@@ -5,11 +5,6 @@
 
 . ${0%/*}/lib.sh
 
-case "$CI_OS_NAME" in
-windows*) cmd //c mklink //j t\\.prove "$(cygpath -aw "$cache_dir/.prove")";;
-*) ln -s "$cache_dir/.prove" t/.prove;;
-esac
-
 run_tests=t
 
 case "$jobname" in
@@ -53,4 +48,8 @@ then
 fi
 check_unignored_build_artifacts
 
+case " $MAKE_TARGETS " in
+*" all "*) make -C contrib/subtree test;;
+esac
+
 save_good_tree
diff --git a/ci/run-test-slice.sh b/ci/run-test-slice.sh
index a3c67956a8..279aff4da9 100755
--- a/ci/run-test-slice.sh
+++ b/ci/run-test-slice.sh
@@ -5,14 +5,12 @@
 
 . ${0%/*}/lib.sh
 
-case "$CI_OS_NAME" in
-windows*) cmd //c mklink //j t\\.prove "$(cygpath -aw "$cache_dir/.prove")";;
-*) ln -s "$cache_dir/.prove" t/.prove;;
-esac
-
 group "Run tests" make --quiet -C t T="$(cd t &&
 	./helper/test-tool path-utils slice-tests "$1" "$2" t[0-9]*.sh |
 	tr '\n' ' ')" ||
 handle_failed_tests
 
+# Run the git subtree tests only if main tests succeeded
+test 0 != "$1" || make -C contrib/subtree test
+
 check_unignored_build_artifacts
diff --git a/compat/bswap.h b/compat/bswap.h
index 512f6f4b99..a443e99eef 100644
--- a/compat/bswap.h
+++ b/compat/bswap.h
@@ -35,7 +35,19 @@ static inline uint64_t default_bswap64(uint64_t val)
 #undef bswap32
 #undef bswap64
 
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+/**
+ * __has_builtin is available since Clang 10 and GCC 10.
+ * Below is a fallback for older compilers.
+ */
+#ifndef __has_builtin
+	#define __has_builtin(x) 0
+#endif
+
+#if __has_builtin(__builtin_bswap32) && __has_builtin(__builtin_bswap64)
+#define bswap32(x) __builtin_bswap32((x))
+#define bswap64(x) __builtin_bswap64((x))
+
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 
 #define bswap32 git_bswap32
 static inline uint32_t git_bswap32(uint32_t x)
diff --git a/compat/lazyload-curl.c b/compat/lazyload-curl.c
new file mode 100644
index 0000000000..9315b4a81f
--- /dev/null
+++ b/compat/lazyload-curl.c
@@ -0,0 +1,408 @@
+#include "../git-compat-util.h"
+#include "../git-curl-compat.h"
+#ifndef WIN32
+#include <dlfcn.h>
+#endif
+
+/*
+ * The ABI version of libcurl is encoded in its shared libraries' file names.
+ * This ABI version has not changed since October 2006 and is unlikely to be
+ * changed in the future. See https://curl.se/libcurl/abi.html for details.
+ */
+#define LIBCURL_ABI_VERSION "4"
+
+typedef void (*func_t)(void);
+
+#ifndef WIN32
+#ifdef __APPLE__
+#define LIBCURL_FILE_NAME(base) base "." LIBCURL_ABI_VERSION ".dylib"
+#else
+#define LIBCURL_FILE_NAME(base) base ".so." LIBCURL_ABI_VERSION
+#endif
+
+static void *load_library(const char *name)
+{
+	return dlopen(name, RTLD_LAZY);
+}
+
+static func_t load_function(void *handle, const char *name)
+{
+	/*
+	 * Casting the return value of `dlsym()` to a function pointer is
+	 * explicitly allowed in recent POSIX standards, but GCC complains
+	 * about this in pedantic mode nevertheless. For more about this issue,
+	 * see https://stackoverflow.com/q/31526876/1860823 and
+	 * http://stackoverflow.com/a/36385690/1905491.
+	 */
+	func_t f;
+	*(void **)&f = dlsym(handle, name);
+	return f;
+}
+#else
+#define LIBCURL_FILE_NAME(base) base "-" LIBCURL_ABI_VERSION ".dll"
+
+static void *load_library(const char *name)
+{
+	size_t name_size = strlen(name) + 1;
+	const char *path = getenv("PATH");
+	char dll_path[MAX_PATH];
+
+	while (path && *path) {
+		const char *sep = strchrnul(path, ';');
+		size_t len = sep - path;
+
+		if (len && len + name_size < sizeof(dll_path)) {
+			memcpy(dll_path, path, len);
+			dll_path[len] = '/';
+			memcpy(dll_path + len + 1, name, name_size);
+
+			if (!access(dll_path, R_OK)) {
+				wchar_t wpath[MAX_PATH];
+				int wlen = MultiByteToWideChar(CP_UTF8, 0, dll_path, -1, wpath, ARRAY_SIZE(wpath));
+				void *res = wlen ? (void *)LoadLibraryExW(wpath, NULL, 0) : NULL;
+				if (!res) {
+					DWORD err = GetLastError();
+					char buf[1024];
+
+					if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
+							    FORMAT_MESSAGE_ARGUMENT_ARRAY |
+							    FORMAT_MESSAGE_IGNORE_INSERTS,
+							    NULL, err, LANG_NEUTRAL,
+							    buf, sizeof(buf) - 1, NULL))
+						xsnprintf(buf, sizeof(buf), "last error: %ld", err);
+					error("LoadLibraryExW() failed with: %s", buf);
+				}
+				return res;
+			}
+		}
+
+		path = *sep ? sep + 1 : NULL;
+	}
+
+	return NULL;
+}
+
+static func_t load_function(void *handle, const char *name)
+{
+	return (func_t)GetProcAddress((HANDLE)handle, name);
+}
+#endif
+
+typedef char *(*curl_easy_escape_type)(CURL *handle, const char *string, int length);
+static curl_easy_escape_type curl_easy_escape_func;
+
+typedef void (*curl_free_type)(void *p);
+static curl_free_type curl_free_func;
+
+typedef CURLcode (*curl_global_init_type)(long flags);
+static curl_global_init_type curl_global_init_func;
+
+typedef CURLsslset (*curl_global_sslset_type)(curl_sslbackend id, const char *name, const curl_ssl_backend ***avail);
+static curl_global_sslset_type curl_global_sslset_func;
+
+typedef void (*curl_global_cleanup_type)(void);
+static curl_global_cleanup_type curl_global_cleanup_func;
+
+typedef struct curl_slist *(*curl_slist_append_type)(struct curl_slist *list, const char *data);
+static curl_slist_append_type curl_slist_append_func;
+
+typedef void (*curl_slist_free_all_type)(struct curl_slist *list);
+static curl_slist_free_all_type curl_slist_free_all_func;
+
+typedef const char *(*curl_easy_strerror_type)(CURLcode error);
+static curl_easy_strerror_type curl_easy_strerror_func;
+
+typedef CURLM *(*curl_multi_init_type)(void);
+static curl_multi_init_type curl_multi_init_func;
+
+typedef CURLMcode (*curl_multi_add_handle_type)(CURLM *multi_handle, CURL *curl_handle);
+static curl_multi_add_handle_type curl_multi_add_handle_func;
+
+typedef CURLMcode (*curl_multi_remove_handle_type)(CURLM *multi_handle, CURL *curl_handle);
+static curl_multi_remove_handle_type curl_multi_remove_handle_func;
+
+typedef CURLMcode (*curl_multi_fdset_type)(CURLM *multi_handle, fd_set *read_fd_set, fd_set *write_fd_set, fd_set *exc_fd_set, int *max_fd);
+static curl_multi_fdset_type curl_multi_fdset_func;
+
+typedef CURLMcode (*curl_multi_perform_type)(CURLM *multi_handle, int *running_handles);
+static curl_multi_perform_type curl_multi_perform_func;
+
+typedef CURLMcode (*curl_multi_cleanup_type)(CURLM *multi_handle);
+static curl_multi_cleanup_type curl_multi_cleanup_func;
+
+typedef CURLMsg *(*curl_multi_info_read_type)(CURLM *multi_handle, int *msgs_in_queue);
+static curl_multi_info_read_type curl_multi_info_read_func;
+
+typedef const char *(*curl_multi_strerror_type)(CURLMcode error);
+static curl_multi_strerror_type curl_multi_strerror_func;
+
+typedef CURLMcode (*curl_multi_timeout_type)(CURLM *multi_handle, long *milliseconds);
+static curl_multi_timeout_type curl_multi_timeout_func;
+
+typedef CURL *(*curl_easy_init_type)(void);
+static curl_easy_init_type curl_easy_init_func;
+
+typedef CURLcode (*curl_easy_perform_type)(CURL *curl);
+static curl_easy_perform_type curl_easy_perform_func;
+
+typedef void (*curl_easy_cleanup_type)(CURL *curl);
+static curl_easy_cleanup_type curl_easy_cleanup_func;
+
+typedef CURL *(*curl_easy_duphandle_type)(CURL *curl);
+static curl_easy_duphandle_type curl_easy_duphandle_func;
+
+typedef CURLcode (*curl_easy_getinfo_long_type)(CURL *curl, CURLINFO info, long *value);
+static curl_easy_getinfo_long_type curl_easy_getinfo_long_func;
+
+typedef CURLcode (*curl_easy_getinfo_pointer_type)(CURL *curl, CURLINFO info, void **value);
+static curl_easy_getinfo_pointer_type curl_easy_getinfo_pointer_func;
+
+typedef CURLcode (*curl_easy_getinfo_off_t_type)(CURL *curl, CURLINFO info, curl_off_t *value);
+static curl_easy_getinfo_off_t_type curl_easy_getinfo_off_t_func;
+
+typedef CURLcode (*curl_easy_setopt_long_type)(CURL *curl, CURLoption opt, long value);
+static curl_easy_setopt_long_type curl_easy_setopt_long_func;
+
+typedef CURLcode (*curl_easy_setopt_pointer_type)(CURL *curl, CURLoption opt, void *value);
+static curl_easy_setopt_pointer_type curl_easy_setopt_pointer_func;
+
+typedef CURLcode (*curl_easy_setopt_off_t_type)(CURL *curl, CURLoption opt, curl_off_t value);
+static curl_easy_setopt_off_t_type curl_easy_setopt_off_t_func;
+
+static char ssl_backend[64];
+
+static void lazy_load_curl(void)
+{
+	static int initialized;
+	void *libcurl = NULL;
+	func_t curl_easy_getinfo_func, curl_easy_setopt_func;
+
+	if (initialized)
+		return;
+
+	initialized = 1;
+	if (ssl_backend[0]) {
+		char dll_name[64 + 16];
+		snprintf(dll_name, sizeof(dll_name) - 1,
+			 LIBCURL_FILE_NAME("libcurl-%s"), ssl_backend);
+		libcurl = load_library(dll_name);
+	}
+	if (!libcurl)
+		libcurl = load_library(LIBCURL_FILE_NAME("libcurl"));
+	if (!libcurl)
+		die("failed to load library '%s'", LIBCURL_FILE_NAME("libcurl"));
+
+	curl_easy_escape_func = (curl_easy_escape_type)load_function(libcurl, "curl_easy_escape");
+	curl_free_func = (curl_free_type)load_function(libcurl, "curl_free");
+	curl_global_init_func = (curl_global_init_type)load_function(libcurl, "curl_global_init");
+	curl_global_sslset_func = (curl_global_sslset_type)load_function(libcurl, "curl_global_sslset");
+	curl_global_cleanup_func = (curl_global_cleanup_type)load_function(libcurl, "curl_global_cleanup");
+	curl_slist_append_func = (curl_slist_append_type)load_function(libcurl, "curl_slist_append");
+	curl_slist_free_all_func = (curl_slist_free_all_type)load_function(libcurl, "curl_slist_free_all");
+	curl_easy_strerror_func = (curl_easy_strerror_type)load_function(libcurl, "curl_easy_strerror");
+	curl_multi_init_func = (curl_multi_init_type)load_function(libcurl, "curl_multi_init");
+	curl_multi_add_handle_func = (curl_multi_add_handle_type)load_function(libcurl, "curl_multi_add_handle");
+	curl_multi_remove_handle_func = (curl_multi_remove_handle_type)load_function(libcurl, "curl_multi_remove_handle");
+	curl_multi_fdset_func = (curl_multi_fdset_type)load_function(libcurl, "curl_multi_fdset");
+	curl_multi_perform_func = (curl_multi_perform_type)load_function(libcurl, "curl_multi_perform");
+	curl_multi_cleanup_func = (curl_multi_cleanup_type)load_function(libcurl, "curl_multi_cleanup");
+	curl_multi_info_read_func = (curl_multi_info_read_type)load_function(libcurl, "curl_multi_info_read");
+	curl_multi_strerror_func = (curl_multi_strerror_type)load_function(libcurl, "curl_multi_strerror");
+	curl_multi_timeout_func = (curl_multi_timeout_type)load_function(libcurl, "curl_multi_timeout");
+	curl_easy_init_func = (curl_easy_init_type)load_function(libcurl, "curl_easy_init");
+	curl_easy_perform_func = (curl_easy_perform_type)load_function(libcurl, "curl_easy_perform");
+	curl_easy_cleanup_func = (curl_easy_cleanup_type)load_function(libcurl, "curl_easy_cleanup");
+	curl_easy_duphandle_func = (curl_easy_duphandle_type)load_function(libcurl, "curl_easy_duphandle");
+
+	curl_easy_getinfo_func = load_function(libcurl, "curl_easy_getinfo");
+	curl_easy_getinfo_long_func = (curl_easy_getinfo_long_type)curl_easy_getinfo_func;
+	curl_easy_getinfo_pointer_func = (curl_easy_getinfo_pointer_type)curl_easy_getinfo_func;
+	curl_easy_getinfo_off_t_func = (curl_easy_getinfo_off_t_type)curl_easy_getinfo_func;
+
+	curl_easy_setopt_func = load_function(libcurl, "curl_easy_setopt");
+	curl_easy_setopt_long_func = (curl_easy_setopt_long_type)curl_easy_setopt_func;
+	curl_easy_setopt_pointer_func = (curl_easy_setopt_pointer_type)curl_easy_setopt_func;
+	curl_easy_setopt_off_t_func = (curl_easy_setopt_off_t_type)curl_easy_setopt_func;
+}
+
+char *curl_easy_escape(CURL *handle, const char *string, int length)
+{
+	lazy_load_curl();
+	return curl_easy_escape_func(handle, string, length);
+}
+
+void curl_free(void *p)
+{
+	lazy_load_curl();
+	curl_free_func(p);
+}
+
+CURLcode curl_global_init(long flags)
+{
+	lazy_load_curl();
+	return curl_global_init_func(flags);
+}
+
+CURLsslset curl_global_sslset(curl_sslbackend id, const char *name, const curl_ssl_backend ***avail)
+{
+	if (name && strlen(name) < sizeof(ssl_backend))
+		strlcpy(ssl_backend, name, sizeof(ssl_backend));
+
+	lazy_load_curl();
+	return curl_global_sslset_func(id, name, avail);
+}
+
+void curl_global_cleanup(void)
+{
+	lazy_load_curl();
+	curl_global_cleanup_func();
+}
+
+struct curl_slist *curl_slist_append(struct curl_slist *list, const char *data)
+{
+	lazy_load_curl();
+	return curl_slist_append_func(list, data);
+}
+
+void curl_slist_free_all(struct curl_slist *list)
+{
+	lazy_load_curl();
+	curl_slist_free_all_func(list);
+}
+
+const char *curl_easy_strerror(CURLcode error)
+{
+	lazy_load_curl();
+	return curl_easy_strerror_func(error);
+}
+
+CURLM *curl_multi_init(void)
+{
+	lazy_load_curl();
+	return curl_multi_init_func();
+}
+
+CURLMcode curl_multi_add_handle(CURLM *multi_handle, CURL *curl_handle)
+{
+	lazy_load_curl();
+	return curl_multi_add_handle_func(multi_handle, curl_handle);
+}
+
+CURLMcode curl_multi_remove_handle(CURLM *multi_handle, CURL *curl_handle)
+{
+	lazy_load_curl();
+	return curl_multi_remove_handle_func(multi_handle, curl_handle);
+}
+
+CURLMcode curl_multi_fdset(CURLM *multi_handle, fd_set *read_fd_set, fd_set *write_fd_set, fd_set *exc_fd_set, int *max_fd)
+{
+	lazy_load_curl();
+	return curl_multi_fdset_func(multi_handle, read_fd_set, write_fd_set, exc_fd_set, max_fd);
+}
+
+CURLMcode curl_multi_perform(CURLM *multi_handle, int *running_handles)
+{
+	lazy_load_curl();
+	return curl_multi_perform_func(multi_handle, running_handles);
+}
+
+CURLMcode curl_multi_cleanup(CURLM *multi_handle)
+{
+	lazy_load_curl();
+	return curl_multi_cleanup_func(multi_handle);
+}
+
+CURLMsg *curl_multi_info_read(CURLM *multi_handle, int *msgs_in_queue)
+{
+	lazy_load_curl();
+	return curl_multi_info_read_func(multi_handle, msgs_in_queue);
+}
+
+const char *curl_multi_strerror(CURLMcode error)
+{
+	lazy_load_curl();
+	return curl_multi_strerror_func(error);
+}
+
+CURLMcode curl_multi_timeout(CURLM *multi_handle, long *milliseconds)
+{
+	lazy_load_curl();
+	return curl_multi_timeout_func(multi_handle, milliseconds);
+}
+
+CURL *curl_easy_init(void)
+{
+	lazy_load_curl();
+	return curl_easy_init_func();
+}
+
+CURLcode curl_easy_perform(CURL *curl)
+{
+	lazy_load_curl();
+	return curl_easy_perform_func(curl);
+}
+
+void curl_easy_cleanup(CURL *curl)
+{
+	lazy_load_curl();
+	curl_easy_cleanup_func(curl);
+}
+
+CURL *curl_easy_duphandle(CURL *curl)
+{
+	lazy_load_curl();
+	return curl_easy_duphandle_func(curl);
+}
+
+#ifndef CURL_IGNORE_DEPRECATION
+#define CURL_IGNORE_DEPRECATION(x) x
+#endif
+
+#ifndef CURLOPTTYPE_BLOB
+#define CURLOPTTYPE_BLOB 40000
+#endif
+
+#undef curl_easy_getinfo
+CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...)
+{
+	va_list ap;
+	CURLcode res;
+
+	va_start(ap, info);
+	lazy_load_curl();
+	CURL_IGNORE_DEPRECATION(
+		if (info >= CURLINFO_LONG && info < CURLINFO_DOUBLE)
+			res = curl_easy_getinfo_long_func(curl, info, va_arg(ap, long *));
+		else if ((info >= CURLINFO_STRING && info < CURLINFO_LONG) ||
+			 (info >= CURLINFO_SLIST && info < CURLINFO_SOCKET))
+			res = curl_easy_getinfo_pointer_func(curl, info, va_arg(ap, void **));
+		else if (info >= CURLINFO_OFF_T)
+			res = curl_easy_getinfo_off_t_func(curl, info, va_arg(ap, curl_off_t *));
+		else
+			die("%s:%d: TODO (info: %d)!", __FILE__, __LINE__, info);
+	)
+	va_end(ap);
+	return res;
+}
+
+#undef curl_easy_setopt
+CURLcode curl_easy_setopt(CURL *curl, CURLoption opt, ...)
+{
+	va_list ap;
+	CURLcode res;
+
+	va_start(ap, opt);
+	lazy_load_curl();
+	CURL_IGNORE_DEPRECATION(
+		if (opt >= CURLOPTTYPE_LONG && opt < CURLOPTTYPE_OBJECTPOINT)
+			res = curl_easy_setopt_long_func(curl, opt, va_arg(ap, long));
+		else if (opt >= CURLOPTTYPE_OBJECTPOINT && opt < CURLOPTTYPE_OFF_T)
+			res = curl_easy_setopt_pointer_func(curl, opt, va_arg(ap, void *));
+		else if (opt >= CURLOPTTYPE_OFF_T && opt < CURLOPTTYPE_BLOB)
+			res = curl_easy_setopt_off_t_func(curl, opt, va_arg(ap, curl_off_t));
+		else
+			die("%s:%d: TODO (opt: %d)!", __FILE__, __LINE__, opt);
+	)
+	va_end(ap);
+	return res;
+}
diff --git a/compat/mimalloc/LICENSE b/compat/mimalloc/LICENSE
new file mode 100644
index 0000000000..670b668a0c
--- /dev/null
+++ b/compat/mimalloc/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018-2021 Microsoft Corporation, Daan Leijen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/compat/mimalloc/alloc-aligned.c b/compat/mimalloc/alloc-aligned.c
new file mode 100644
index 0000000000..ce519a18c3
--- /dev/null
+++ b/compat/mimalloc/alloc-aligned.c
@@ -0,0 +1,306 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h>  // memset
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+// Fallback primitive aligned allocation -- split out for better codegen
+static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_fallback(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+  mi_assert_internal(size <= PTRDIFF_MAX);
+  mi_assert_internal(alignment != 0 && _mi_is_power_of_two(alignment));
+
+  const uintptr_t align_mask = alignment - 1;  // for any x, `(x & align_mask) == (x % alignment)`
+  const size_t padsize = size + MI_PADDING_SIZE;
+
+  // use regular allocation if it is guaranteed to fit the alignment constraints
+  if (offset==0 && alignment<=padsize && padsize<=MI_MAX_ALIGN_GUARANTEE && (padsize&align_mask)==0) {
+    void* p = _mi_heap_malloc_zero(heap, size, zero);
+    mi_assert_internal(p == NULL || ((uintptr_t)p % alignment) == 0);
+    return p;
+  }
+
+  void* p;
+  size_t oversize;
+  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {
+    // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page)
+    // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the
+    // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down)
+    if mi_unlikely(offset != 0) {
+      // todo: cannot support offset alignment for very large alignments yet
+      #if MI_DEBUG > 0
+      _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
+      #endif
+      return NULL;
+    }
+    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
+    p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block
+    // zero afterwards as only the area from the aligned_p may be committed!
+    if (p == NULL) return NULL;
+  }
+  else {
+    // otherwise over-allocate
+    oversize = size + alignment - 1;
+    p = _mi_heap_malloc_zero(heap, oversize, zero);
+    if (p == NULL) return NULL;
+  }
+
+  // .. and align within the allocation
+  const uintptr_t poffset = ((uintptr_t)p + offset) & align_mask;
+  const uintptr_t adjust  = (poffset == 0 ? 0 : alignment - poffset);
+  mi_assert_internal(adjust < alignment);
+  void* aligned_p = (void*)((uintptr_t)p + adjust);
+  if (aligned_p != p) {
+    mi_page_set_has_aligned(_mi_ptr_page(p), true);
+  }
+
+  mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
+  mi_assert_internal(p == _mi_page_ptr_unalign(_mi_ptr_segment(aligned_p), _mi_ptr_page(aligned_p), aligned_p));
+  mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
+  mi_assert_internal(mi_page_usable_block_size(_mi_ptr_page(p)) >= adjust + size);
+
+  // now zero the block if needed
+  if (zero && alignment > MI_ALIGNMENT_MAX) {
+    const ptrdiff_t diff = (uint8_t*)aligned_p - (uint8_t*)p;
+    const ptrdiff_t zsize = mi_page_usable_block_size(_mi_ptr_page(p)) - diff - MI_PADDING_SIZE;
+    if (zsize > 0) { _mi_memzero(aligned_p, zsize); }
+  }
+
+  #if MI_TRACK_ENABLED
+  if (p != aligned_p) {
+    mi_track_free_size(p, oversize);
+    mi_track_malloc(aligned_p, size, zero);
+  }
+  else {
+    mi_track_resize(aligned_p, oversize, size);
+  }
+  #endif
+  return aligned_p;
+}
+
+// Primitive aligned allocation
+static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept
+{
+  // note: we don't require `size > offset`, we just guarantee that the address at offset is aligned regardless of the allocated size.
+  mi_assert(alignment > 0);
+  if mi_unlikely(alignment == 0 || !_mi_is_power_of_two(alignment)) { // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation requires the alignment to be a power-of-two (size %zu, alignment %zu)\n", size, alignment);
+    #endif
+    return NULL;
+  }
+  /*
+  if mi_unlikely(alignment > MI_ALIGNMENT_MAX) {  // we cannot align at a boundary larger than this (or otherwise we cannot find segment headers)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation has a maximum alignment of %zu (size %zu, alignment %zu)\n", MI_ALIGNMENT_MAX, size, alignment);
+    #endif
+    return NULL;
+  }
+  */
+  if mi_unlikely(size > PTRDIFF_MAX) {          // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "aligned allocation request is too large (size %zu, alignment %zu)\n", size, alignment);
+    #endif
+    return NULL;
+  }
+  const uintptr_t align_mask = alignment-1;       // for any x, `(x & align_mask) == (x % alignment)`
+  const size_t padsize = size + MI_PADDING_SIZE;  // note: cannot overflow due to earlier size > PTRDIFF_MAX check
+
+  // try first if there happens to be a small block available with just the right alignment
+  if mi_likely(padsize <= MI_SMALL_SIZE_MAX && alignment <= padsize) {
+    mi_page_t* page = _mi_heap_get_free_small_page(heap, padsize);
+    const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
+    if mi_likely(page->free != NULL && is_aligned)
+    {
+      #if MI_STAT>1
+      mi_heap_stat_increase(heap, malloc, size);
+      #endif
+      void* p = _mi_page_malloc(heap, page, padsize, zero); // TODO: inline _mi_page_malloc
+      mi_assert_internal(p != NULL);
+      mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
+      mi_track_malloc(p,size,zero);
+      return p;
+    }
+  }
+  // fallback
+  return mi_heap_malloc_zero_aligned_at_fallback(heap, size, alignment, offset, zero);
+}
+
+
+// ------------------------------------------------------
+// Optimized mi_heap_malloc_aligned / mi_malloc_aligned
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, false);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  #if !MI_PADDING
+  // without padding, any small sized allocation is naturally aligned (see also `_mi_segment_page_start`)
+  if (!_mi_is_power_of_two(alignment)) return NULL;
+  if mi_likely(_mi_is_power_of_two(size) && size >= alignment && size <= MI_SMALL_SIZE_MAX)
+  #else
+  // with padding, we can only guarantee this for fixed alignments
+  if mi_likely((alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)))
+		&& size <= MI_SMALL_SIZE_MAX)
+  #endif
+  {
+    // fast path for common alignment and size
+    return mi_heap_malloc_small(heap, size);
+  }
+  else {
+    return mi_heap_malloc_aligned_at(heap, size, alignment, 0);
+  }
+}
+
+// ------------------------------------------------------
+// Aligned Allocation
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_zero_aligned_at(heap, size, alignment, offset, true);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned_at(heap, size, alignment, 0);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_zalloc_aligned_at(heap, total, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_calloc_aligned_at(heap,count,size,alignment,0);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment);
+}
+
+
+// ------------------------------------------------------
+// Aligned re-allocation
+// ------------------------------------------------------
+
+static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset, bool zero) mi_attr_noexcept {
+  mi_assert(alignment > 0);
+  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+  if (p == NULL) return mi_heap_malloc_zero_aligned_at(heap,newsize,alignment,offset,zero);
+  size_t size = mi_usable_size(p);
+  if (newsize <= size && newsize >= (size - (size / 2))
+      && (((uintptr_t)p + offset) % alignment) == 0) {
+    return p;  // reallocation still fits, is aligned and not more than 50% waste
+  }
+  else {
+    void* newp = mi_heap_malloc_aligned_at(heap,newsize,alignment,offset);
+    if (newp != NULL) {
+      if (zero && newsize > size) {
+	const mi_page_t* page = _mi_ptr_page(newp);
+	if (page->is_zero) {
+	  // already zero initialized
+	  mi_assert_expensive(mi_mem_is_zero(newp,newsize));
+	}
+	else {
+	  // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+	  size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+	  memset((uint8_t*)newp + start, 0, newsize - start);
+	}
+      }
+      _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
+      mi_free(p); // only free if successful
+    }
+    return newp;
+  }
+}
+
+static void* mi_heap_realloc_zero_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, bool zero) mi_attr_noexcept {
+  mi_assert(alignment > 0);
+  if (alignment <= sizeof(uintptr_t)) return _mi_heap_realloc_zero(heap,p,newsize,zero);
+  size_t offset = ((uintptr_t)p % alignment); // use offset of previous allocation (p can be NULL)
+  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,zero);
+}
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned_at(heap,p,newsize,alignment,offset,false);
+}
+
+mi_decl_nodiscard void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned(heap,p,newsize,alignment,false);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned_at(heap, p, newsize, alignment, offset, true);
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_zero_aligned(heap, p, newsize, alignment, true);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+  return mi_heap_rezalloc_aligned_at(heap, p, total, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(newcount, size, &total)) return NULL;
+  return mi_heap_rezalloc_aligned(heap, p, total, alignment);
+}
+
+mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept {
+  return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment);
+}
+
+mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept {
+  return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset);
+}
+
+mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept {
+  return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment);
+}
diff --git a/compat/mimalloc/alloc.c b/compat/mimalloc/alloc.c
new file mode 100644
index 0000000000..027421abf6
--- /dev/null
+++ b/compat/mimalloc/alloc.c
@@ -0,0 +1,1027 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // for realpath() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+
+#include <string.h>  // memset, strlen
+#include <stdlib.h>  // malloc, exit
+
+// ------------------------------------------------------
+// Allocation
+// ------------------------------------------------------
+
+// Fast allocation in a page: just pop from the free list.
+// Fall back to generic allocation only if the list is empty.
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept {
+  mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
+  mi_block_t* const block = page->free;
+  if mi_unlikely(block == NULL) {
+    return _mi_malloc_generic(heap, size, zero, 0);
+  }
+  mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
+  // pop from the free list
+  page->used++;
+  page->free = mi_block_next(page, block);
+  mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
+
+  // allow use of the block internally
+  // note: when tracking we need to avoid ever touching the MI_PADDING since
+  // that is tracked by valgrind etc. as non-accessible (through the red-zone, see `mimalloc-track.h`)
+  mi_track_mem_undefined(block, mi_page_usable_block_size(page));
+
+  // zero the block? note: we need to zero the full block size (issue #63)
+  if mi_unlikely(zero) {
+    mi_assert_internal(page->xblock_size != 0); // do not call with zero'ing for huge blocks (see _mi_malloc_generic)
+    const size_t zsize = (page->is_zero ? sizeof(block->next) + MI_PADDING_SIZE : page->xblock_size);
+    _mi_memzero_aligned(block, zsize - MI_PADDING_SIZE);
+  }
+
+#if (MI_DEBUG>0) && !MI_TRACK_ENABLED
+  if (!page->is_zero && !zero && !mi_page_is_huge(page)) {
+    memset(block, MI_DEBUG_UNINIT, mi_page_usable_block_size(page));
+  }
+#elif (MI_SECURE!=0)
+  if (!zero) { block->next = 0; } // don't leak internal data
+#endif
+
+#if (MI_STAT>0)
+  const size_t bsize = mi_page_usable_block_size(page);
+  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    mi_heap_stat_increase(heap, normal, bsize);
+    mi_heap_stat_counter_increase(heap, normal_count, 1);
+#if (MI_STAT>1)
+    const size_t bin = _mi_bin(bsize);
+    mi_heap_stat_increase(heap, normal_bins[bin], 1);
+#endif
+  }
+#endif
+
+#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+  mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
+  ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
+  #if (MI_DEBUG>1)
+  mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));  // note: re-enable since mi_page_usable_block_size may set noaccess
+  #endif
+  padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
+  padding->delta  = (uint32_t)(delta);
+  if (!mi_page_is_huge(page)) {
+    uint8_t* fill = (uint8_t*)padding - delta;
+    const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
+    for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
+  }
+#endif
+
+  return block;
+}
+
+static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
+  mi_assert(heap != NULL);
+  #if MI_DEBUG
+  const uintptr_t tid = _mi_thread_id();
+  mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local
+  #endif
+  mi_assert(size <= MI_SMALL_SIZE_MAX);
+#if (MI_PADDING)
+  if (size == 0) {
+    size = sizeof(void*);
+  }
+#endif
+  mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE);
+  void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE, zero);
+  mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+#if MI_STAT>1
+  if (p != NULL) {
+    if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+    mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+  }
+#endif
+  mi_track_malloc(p,size,zero);
+  return p;
+}
+
+// allocate a small block
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small_zero(heap, size, false);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small(mi_get_default_heap(), size);
+}
+
+// The main allocation function
+extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept {
+  if mi_likely(size <= MI_SMALL_SIZE_MAX) {
+    mi_assert_internal(huge_alignment == 0);
+    return mi_heap_malloc_small_zero(heap, size, zero);
+  }
+  else {
+    mi_assert(heap!=NULL);
+    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id());   // heaps are thread local
+    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment);  // note: size can overflow but it is detected in malloc_generic
+    mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
+    #if MI_STAT>1
+    if (p != NULL) {
+      if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); }
+      mi_heap_stat_increase(heap, malloc, mi_usable_size(p));
+    }
+    #endif
+    mi_track_malloc(p,size,zero);
+    return p;
+  }
+}
+
+extern inline void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept {
+  return _mi_heap_malloc_zero_ex(heap, size, zero, 0);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return _mi_heap_malloc_zero(heap, size, false);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc(mi_get_default_heap(), size);
+}
+
+// zero initialized small block
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept {
+  return mi_heap_malloc_small_zero(mi_get_default_heap(), size, true);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept {
+  return _mi_heap_malloc_zero(heap, size, true);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept {
+  return mi_heap_zalloc(mi_get_default_heap(),size);
+}
+
+
+// ------------------------------------------------------
+// Check for double free in secure and debug mode
+// This is somewhat expensive so only enabled for secure mode 4
+// ------------------------------------------------------
+
+#if (MI_ENCODE_FREELIST && (MI_SECURE>=4 || MI_DEBUG!=0))
+// linear check if the free list contains a specific element
+static bool mi_list_contains(const mi_page_t* page, const mi_block_t* list, const mi_block_t* elem) {
+  while (list != NULL) {
+    if (elem==list) return true;
+    list = mi_block_next(page, list);
+  }
+  return false;
+}
+
+static mi_decl_noinline bool mi_check_is_double_freex(const mi_page_t* page, const mi_block_t* block) {
+  // The decoded value is in the same page (or NULL).
+  // Walk the free lists to verify positively if it is already freed
+  if (mi_list_contains(page, page->free, block) ||
+      mi_list_contains(page, page->local_free, block) ||
+      mi_list_contains(page, mi_page_thread_free(page), block))
+  {
+    _mi_error_message(EAGAIN, "double free detected of block %p with size %zu\n", block, mi_page_block_size(page));
+    return true;
+  }
+  return false;
+}
+
+#define mi_track_page(page,access)  { size_t psize; void* pstart = _mi_page_start(_mi_page_segment(page),page,&psize); mi_track_mem_##access( pstart, psize); }
+
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  bool is_double_free = false;
+  mi_block_t* n = mi_block_nextx(page, block, page->keys); // pretend it is freed, and get the decoded first field
+  if (((uintptr_t)n & (MI_INTPTR_SIZE-1))==0 &&  // quick check: aligned pointer?
+      (n==NULL || mi_is_in_same_page(block, n))) // quick check: in same page or NULL?
+  {
+    // Suspicous: decoded value a in block is in the same page (or NULL) -- maybe a double free?
+    // (continue in separate function to improve code generation)
+    is_double_free = mi_check_is_double_freex(page, block);
+  }
+  return is_double_free;
+}
+#else
+static inline bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  return false;
+}
+#endif
+
+// ---------------------------------------------------------------------------
+// Check for heap block overflow by setting up padding at the end of the block
+// ---------------------------------------------------------------------------
+
+#if (MI_PADDING>0) && defined(MI_ENCODE_FREELIST) && !MI_TRACK_ENABLED
+static bool mi_page_decode_padding(const mi_page_t* page, const mi_block_t* block, size_t* delta, size_t* bsize) {
+  *bsize = mi_page_usable_block_size(page);
+  const mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + *bsize);
+  mi_track_mem_defined(padding,sizeof(mi_padding_t));
+  *delta = padding->delta;
+  uint32_t canary = padding->canary;
+  uintptr_t keys[2];
+  keys[0] = page->keys[0];
+  keys[1] = page->keys[1];
+  bool ok = ((uint32_t)mi_ptr_encode(page,block,keys) == canary && *delta <= *bsize);
+  mi_track_mem_noaccess(padding,sizeof(mi_padding_t));
+  return ok;
+}
+
+// Return the exact usable size of a block.
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok); mi_assert_internal(delta <= bsize);
+  return (ok ? bsize - delta : 0);
+}
+
+static bool mi_verify_padding(const mi_page_t* page, const mi_block_t* block, size_t* size, size_t* wrong) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  *size = *wrong = bsize;
+  if (!ok) return false;
+  mi_assert_internal(bsize >= delta);
+  *size = bsize - delta;
+  if (!mi_page_is_huge(page)) {
+    uint8_t* fill = (uint8_t*)block + bsize - delta;
+    const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // check at most the first N padding bytes
+    mi_track_mem_defined(fill, maxpad);
+    for (size_t i = 0; i < maxpad; i++) {
+      if (fill[i] != MI_DEBUG_PADDING) {
+	*wrong = bsize - delta + i;
+	ok = false;
+	break;
+      }
+    }
+    mi_track_mem_noaccess(fill, maxpad);
+  }
+  return ok;
+}
+
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  size_t size;
+  size_t wrong;
+  if (!mi_verify_padding(page,block,&size,&wrong)) {
+    _mi_error_message(EFAULT, "buffer overflow in heap block %p of size %zu: write after %zu bytes\n", block, size, wrong );
+  }
+}
+
+// When a non-thread-local block is freed, it becomes part of the thread delayed free
+// list that is freed later by the owning heap. If the exact usable size is too small to
+// contain the pointer for the delayed list, then shrink the padding (by decreasing delta)
+// so it will later not trigger an overflow error in `mi_free_block`.
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  size_t bsize;
+  size_t delta;
+  bool ok = mi_page_decode_padding(page, block, &delta, &bsize);
+  mi_assert_internal(ok);
+  if (!ok || (bsize - delta) >= min_size) return;  // usually already enough space
+  mi_assert_internal(bsize >= min_size);
+  if (bsize < min_size) return;  // should never happen
+  size_t new_delta = (bsize - min_size);
+  mi_assert_internal(new_delta < bsize);
+  mi_padding_t* padding = (mi_padding_t*)((uint8_t*)block + bsize);
+  padding->delta = (uint32_t)new_delta;
+}
+#else
+static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+}
+
+static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(block);
+  return mi_page_usable_block_size(page);
+}
+
+static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size) {
+  MI_UNUSED(page);
+  MI_UNUSED(block);
+  MI_UNUSED(min_size);
+}
+#endif
+
+// only maintain stats for smaller objects if requested
+#if (MI_STAT>0)
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  #if (MI_STAT < 2)
+  MI_UNUSED(block);
+  #endif
+  mi_heap_t* const heap = mi_heap_get_default();
+  const size_t bsize = mi_page_usable_block_size(page);
+  #if (MI_STAT>1)
+  const size_t usize = mi_page_usable_size_of(page, block);
+  mi_heap_stat_decrease(heap, malloc, usize);
+  #endif
+  if (bsize <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, normal, bsize);
+    #if (MI_STAT > 1)
+    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
+    #endif
+  }
+  else if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, large, bsize);
+  }
+  else {
+    mi_heap_stat_decrease(heap, huge, bsize);
+  }
+}
+#else
+static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
+  MI_UNUSED(page); MI_UNUSED(block);
+}
+#endif
+
+#if MI_HUGE_PAGE_ABANDON
+#if (MI_STAT>0)
+// maintain stats for huge objects
+static void mi_stat_huge_free(const mi_page_t* page) {
+  mi_heap_t* const heap = mi_heap_get_default();
+  const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc`
+  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, large, bsize);
+  }
+  else {
+    mi_heap_stat_decrease(heap, huge, bsize);
+  }
+}
+#else
+static void mi_stat_huge_free(const mi_page_t* page) {
+  MI_UNUSED(page);
+}
+#endif
+#endif
+
+// ------------------------------------------------------
+// Free
+// ------------------------------------------------------
+
+// multi-threaded free (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON)
+static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* block)
+{
+  // The padding check may access the non-thread-owned page for the key values.
+  // that is safe as these are constant and the page won't be freed (as the block is not freed yet).
+  mi_check_padding(page, block);
+  mi_padding_shrink(page, block, sizeof(mi_block_t));       // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
+
+  // huge page segments are always abandoned and can be freed immediately
+  mi_segment_t* segment = _mi_page_segment(page);
+  if (segment->kind == MI_SEGMENT_HUGE) {
+    #if MI_HUGE_PAGE_ABANDON
+    // huge page segments are always abandoned and can be freed immediately
+    mi_stat_huge_free(page);
+    _mi_segment_huge_page_free(segment, page, block);
+    return;
+    #else
+    // huge pages are special as they occupy the entire segment
+    // as these are large we reset the memory occupied by the page so it is available to other threads
+    // (as the owning thread needs to actually free the memory later).
+    _mi_segment_huge_page_reset(segment, page, block);
+    #endif
+  }
+
+  #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED                    // note: when tracking, cannot use mi_usable_size with multi-threading
+  if (segment->kind != MI_SEGMENT_HUGE) {                   // not for huge segments as we just reset the content
+    memset(block, MI_DEBUG_FREED, mi_usable_size(block));
+  }
+  #endif
+
+  // Try to put the block on either the page-local thread free list, or the heap delayed free list.
+  mi_thread_free_t tfreex;
+  bool use_delayed;
+  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+  do {
+    use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
+    if mi_unlikely(use_delayed) {
+      // unlikely: this only happens on the first concurrent free in a page that is in the full list
+      tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING);
+    }
+    else {
+      // usual: directly add to page thread_free list
+      mi_block_set_next(page, block, mi_tf_block(tfree));
+      tfreex = mi_tf_set_block(tfree,block);
+    }
+  } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+
+  if mi_unlikely(use_delayed) {
+    // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
+    mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
+    mi_assert_internal(heap != NULL);
+    if (heap != NULL) {
+      // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
+      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+      do {
+	mi_block_set_nextx(heap,block,dfree, heap->keys);
+      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+    }
+
+    // and reset the MI_DELAYED_FREEING flag
+    tfree = mi_atomic_load_relaxed(&page->xthread_free);
+    do {
+      tfreex = tfree;
+      mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
+      tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
+    } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+  }
+}
+
+// regular free
+static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
+{
+  // and push it on the free list
+  //const size_t bsize = mi_page_block_size(page);
+  if mi_likely(local) {
+    // owning thread can free a block directly
+    if mi_unlikely(mi_check_is_double_free(page, block)) return;
+    mi_check_padding(page, block);
+    #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+    if (!mi_page_is_huge(page)) {   // huge page content may be already decommitted
+      memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+    }
+    #endif
+    mi_block_set_next(page, block, page->local_free);
+    page->local_free = block;
+    page->used--;
+    if mi_unlikely(mi_page_all_free(page)) {
+      _mi_page_retire(page);
+    }
+    else if mi_unlikely(mi_page_is_in_full(page)) {
+      _mi_page_unfull(page);
+    }
+  }
+  else {
+    _mi_free_block_mt(page,block);
+  }
+}
+
+
+// Adjust a block that was allocated aligned, to the actual start of the block in the page.
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
+  mi_assert_internal(page!=NULL && p!=NULL);
+  const size_t diff   = (uint8_t*)p - _mi_page_start(segment, page, NULL);
+  const size_t adjust = (diff % mi_page_block_size(page));
+  return (mi_block_t*)((uintptr_t)p - adjust);
+}
+
+
+void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
+  mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
+  mi_stat_free(page, block);                 // stat_free may access the padding
+  mi_track_free(p);
+  _mi_free_block(page, is_local, block);
+}
+
+// Get the segment data belonging to a pointer
+// This is just a single `and` in assembly but does further checks in debug mode
+// (and secure mode) if this was a valid pointer.
+static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
+{
+  MI_UNUSED(msg);
+  mi_assert(p != NULL);
+
+#if (MI_DEBUG>0)
+  if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) {
+    _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
+    return NULL;
+  }
+#endif
+
+  mi_segment_t* const segment = _mi_ptr_segment(p);
+  mi_assert_internal(segment != NULL);
+
+#if (MI_DEBUG>0)
+  if mi_unlikely(!mi_is_in_heap_region(p)) {
+  #if (MI_INTPTR_SIZE == 8 && defined(__linux__))
+    if (((uintptr_t)p >> 40) != 0x7F) { // linux tends to align large blocks above 0x7F000000000 (issue #640)
+  #else
+    {
+  #endif
+      _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
+	"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
+      if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) {
+	_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
+      }
+    }
+  }
+#endif
+#if (MI_DEBUG>0 || MI_SECURE>=4)
+  if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) {
+    _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p);
+    return NULL;
+  }
+#endif
+
+  return segment;
+}
+
+// Free a block
+// fast path written carefully to prevent spilling on the stack
+void mi_free(void* p) mi_attr_noexcept
+{
+  if mi_unlikely(p == NULL) return;
+  mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
+  const bool          is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id));
+  mi_page_t* const    page    = _mi_segment_page_of(segment, p);
+
+  if mi_likely(is_local) {                       // thread-local free?
+    if mi_likely(page->flags.full_aligned == 0)  // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
+    {
+      mi_block_t* const block = (mi_block_t*)p;
+      if mi_unlikely(mi_check_is_double_free(page, block)) return;
+      mi_check_padding(page, block);
+      mi_stat_free(page, block);
+      #if (MI_DEBUG!=0) && !MI_TRACK_ENABLED
+      memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
+      #endif
+      mi_track_free(p);
+      mi_block_set_next(page, block, page->local_free);
+      page->local_free = block;
+      if mi_unlikely(--page->used == 0) {   // using this expression generates better code than: page->used--; if (mi_page_all_free(page))
+	_mi_page_retire(page);
+      }
+    }
+    else {
+      // page is full or contains (inner) aligned blocks; use generic path
+      _mi_free_generic(segment, page, true, p);
+    }
+  }
+  else {
+    // not thread-local; use generic path
+    _mi_free_generic(segment, page, false, p);
+  }
+}
+
+// return true if successful
+bool _mi_free_delayed_block(mi_block_t* block) {
+  // get segment and page
+  const mi_segment_t* const segment = _mi_ptr_segment(block);
+  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(_mi_thread_id() == segment->thread_id);
+  mi_page_t* const page = _mi_segment_page_of(segment, block);
+
+  // Clear the no-delayed flag so delayed freeing is used again for this page.
+  // This must be done before collecting the free lists on this page -- otherwise
+  // some blocks may end up in the page `thread_free` list with no blocks in the
+  // heap `thread_delayed_free` list which may cause the page to be never freed!
+  // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
+  if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
+    return false;
+  }
+
+  // collect all other non-local frees to ensure up-to-date `used` count
+  _mi_page_free_collect(page, false);
+
+  // and free the block (possibly freeing the page as well since used is updated)
+  _mi_free_block(page, true, block);
+  return true;
+}
+
+// Bytes available in a block
+mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
+  const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p);
+  const size_t size = mi_page_usable_size_of(page, block);
+  const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
+  mi_assert_internal(adjust >= 0 && (size_t)adjust <= size);
+  return (size - adjust);
+}
+
+static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
+  if (p == NULL) return 0;
+  const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg);
+  const mi_page_t* const page = _mi_segment_page_of(segment, p);
+  if mi_likely(!mi_page_has_aligned(page)) {
+    const mi_block_t* block = (const mi_block_t*)p;
+    return mi_page_usable_size_of(page, block);
+  }
+  else {
+    // split out to separate routine for improved code generation
+    return mi_page_usable_aligned_size_of(segment, page, p);
+  }
+}
+
+mi_decl_nodiscard size_t mi_usable_size(const void* p) mi_attr_noexcept {
+  return _mi_usable_size(p, "mi_usable_size");
+}
+
+
+// ------------------------------------------------------
+// Allocation extensions
+// ------------------------------------------------------
+
+void mi_free_size(void* p, size_t size) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(size);
+  mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
+  mi_free(p);
+}
+
+void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free_size(p,size);
+}
+
+void mi_free_aligned(void* p, size_t alignment) mi_attr_noexcept {
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert(((uintptr_t)p % alignment) == 0);
+  mi_free(p);
+}
+
+mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count,size,&total)) return NULL;
+  return mi_heap_zalloc(heap,total);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_calloc(mi_get_default_heap(),count,size);
+}
+
+// Uninitialized `calloc`
+mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_malloc(heap, total);
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_mallocn(mi_get_default_heap(),count,size);
+}
+
+// Expand (or shrink) in place (or fail)
+void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
+  #if MI_PADDING
+  // we do not shrink/expand with padding enabled
+  MI_UNUSED(p); MI_UNUSED(newsize);
+  return NULL;
+  #else
+  if (p == NULL) return NULL;
+  const size_t size = _mi_usable_size(p,"mi_expand");
+  if (newsize > size) return NULL;
+  return p; // it fits
+  #endif
+}
+
+void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept {
+  // if p == NULL then behave as malloc.
+  // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)).
+  // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.)
+  const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0)
+  if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) {  // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0)
+    // todo: adjust potential padding to reflect the new size?
+    mi_track_free_size(p, size);
+    mi_track_malloc(p,newsize,true);
+    return p;  // reallocation still fits and not more than 50% waste
+  }
+  void* newp = mi_heap_malloc(heap,newsize);
+  if mi_likely(newp != NULL) {
+    if (zero && newsize > size) {
+      // also set last word in the previous allocation to zero to ensure any padding is zero-initialized
+      const size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
+      memset((uint8_t*)newp + start, 0, newsize - start);
+    }
+    if mi_likely(p != NULL) {
+      if mi_likely(_mi_is_aligned(p, sizeof(uintptr_t))) {  // a client may pass in an arbitrary pointer `p`..
+	const size_t copysize = (newsize > size ? size : newsize);
+	mi_track_mem_defined(p,copysize);  // _mi_useable_size may be too large for byte precise memory tracking..
+	_mi_memcpy_aligned(newp, p, copysize);
+      }
+      mi_free(p); // only free the original pointer if successful
+    }
+  }
+  return newp;
+}
+
+mi_decl_nodiscard void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_heap_realloc_zero(heap, p, newsize, false);
+}
+
+mi_decl_nodiscard void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_realloc(heap, p, total);
+}
+
+
+// Reallocate but free `p` on errors
+mi_decl_nodiscard void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  void* newp = mi_heap_realloc(heap, p, newsize);
+  if (newp==NULL && p!=NULL) mi_free(p);
+  return newp;
+}
+
+mi_decl_nodiscard void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept {
+  return _mi_heap_realloc_zero(heap, p, newsize, true);
+}
+
+mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept {
+  size_t total;
+  if (mi_count_size_overflow(count, size, &total)) return NULL;
+  return mi_heap_rezalloc(heap, p, total);
+}
+
+
+mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_realloc(mi_get_default_heap(),p,newsize);
+}
+
+mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_reallocn(mi_get_default_heap(),p,count,size);
+}
+
+// Reallocate but free `p` on errors
+mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_reallocf(mi_get_default_heap(),p,newsize);
+}
+
+mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept {
+  return mi_heap_rezalloc(mi_get_default_heap(), p, newsize);
+}
+
+mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept {
+  return mi_heap_recalloc(mi_get_default_heap(), p, count, size);
+}
+
+
+
+// ------------------------------------------------------
+// strdup, strndup, and realpath
+// ------------------------------------------------------
+
+// `strdup` using mi_malloc
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept {
+  if (s == NULL) return NULL;
+  size_t n = strlen(s);
+  char* t = (char*)mi_heap_malloc(heap,n+1);
+  if (t == NULL) return NULL;
+  _mi_memcpy(t, s, n);
+  t[n] = 0;
+  return t;
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept {
+  return mi_heap_strdup(mi_get_default_heap(), s);
+}
+
+// `strndup` using mi_malloc
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept {
+  if (s == NULL) return NULL;
+  const char* end = (const char*)memchr(s, 0, n);  // find end of string in the first `n` characters (returns NULL if not found)
+  const size_t m = (end != NULL ? (size_t)(end - s) : n);  // `m` is the minimum of `n` or the end-of-string
+  mi_assert_internal(m <= n);
+  char* t = (char*)mi_heap_malloc(heap, m+1);
+  if (t == NULL) return NULL;
+  _mi_memcpy(t, s, m);
+  t[m] = 0;
+  return t;
+}
+
+mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
+  return mi_heap_strndup(mi_get_default_heap(),s,n);
+}
+
+#ifndef __wasi__
+// `realpath` using mi_malloc
+#ifdef _WIN32
+#ifndef PATH_MAX
+#define PATH_MAX MAX_PATH
+#endif
+#include <windows.h>
+mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+  // todo: use GetFullPathNameW to allow longer file names
+  char buf[PATH_MAX];
+  DWORD res = GetFullPathNameA(fname, PATH_MAX, (resolved_name == NULL ? buf : resolved_name), NULL);
+  if (res == 0) {
+    errno = GetLastError(); return NULL;
+  }
+  else if (res > PATH_MAX) {
+    errno = EINVAL; return NULL;
+  }
+  else if (resolved_name != NULL) {
+    return resolved_name;
+  }
+  else {
+    return mi_heap_strndup(heap, buf, PATH_MAX);
+  }
+}
+#else
+/*
+#include <unistd.h>  // pathconf
+static size_t mi_path_max(void) {
+  static size_t path_max = 0;
+  if (path_max <= 0) {
+    long m = pathconf("/",_PC_PATH_MAX);
+    if (m <= 0) path_max = 4096;      // guess
+    else if (m < 256) path_max = 256; // at least 256
+    else path_max = m;
+  }
+  return path_max;
+}
+*/
+char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
+  if (resolved_name != NULL) {
+    return realpath(fname,resolved_name);
+  }
+  else {
+    char* rname = realpath(fname, NULL);
+    if (rname == NULL) return NULL;
+    char* result = mi_heap_strdup(heap, rname);
+    free(rname);  // use regular free! (which may be redirected to our free but that's ok)
+    return result;
+  }
+  /*
+    const size_t n  = mi_path_max();
+    char* buf = (char*)mi_malloc(n+1);
+    if (buf == NULL) {
+      errno = ENOMEM;
+      return NULL;
+    }
+    char* rname  = realpath(fname,buf);
+    char* result = mi_heap_strndup(heap,rname,n); // ok if `rname==NULL`
+    mi_free(buf);
+    return result;
+  }
+  */
+}
+#endif
+
+mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept {
+  return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name);
+}
+#endif
+
+/*-------------------------------------------------------
+C++ new and new_aligned
+The standard requires calling into `get_new_handler` and
+throwing the bad_alloc exception on failure. If we compile
+with a C++ compiler we can implement this precisely. If we
+use a C compiler we cannot throw a `bad_alloc` exception
+but we call `exit` instead (i.e. not returning).
+-------------------------------------------------------*/
+
+#ifdef __cplusplus
+#include <new>
+static bool mi_try_new_handler(bool nothrow) {
+  #if defined(_MSC_VER) || (__cplusplus >= 201103L)
+    std::new_handler h = std::get_new_handler();
+  #else
+    std::new_handler h = std::set_new_handler();
+    std::set_new_handler(h);
+  #endif
+  if (h==NULL) {
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
+    if (!nothrow) {
+      throw std::bad_alloc();
+    }
+    return false;
+  }
+  else {
+    h();
+    return true;
+  }
+}
+#else
+typedef void (*std_new_handler_t)(void);
+
+#if (defined(__GNUC__) || (defined(__clang__) && !defined(_MSC_VER)))  // exclude clang-cl, see issue #631
+std_new_handler_t __attribute__((weak)) _ZSt15get_new_handlerv(void) {
+  return NULL;
+}
+static std_new_handler_t mi_get_new_handler(void) {
+  return _ZSt15get_new_handlerv();
+}
+#else
+// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
+static std_new_handler_t mi_get_new_handler() {
+  return NULL;
+}
+#endif
+
+static bool mi_try_new_handler(bool nothrow) {
+  std_new_handler_t h = mi_get_new_handler();
+  if (h==NULL) {
+    _mi_error_message(ENOMEM, "out of memory in 'new'");
+    if (!nothrow) {
+      abort();  // cannot throw in plain C, use abort
+    }
+    return false;
+  }
+  else {
+    h();
+    return true;
+  }
+}
+#endif
+
+static mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t size, bool nothrow ) {
+  void* p = NULL;
+  while(p == NULL && mi_try_new_handler(nothrow)) {
+    p = mi_heap_malloc(heap,size);
+  }
+  return p;
+}
+
+static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) {
+  return mi_heap_try_new(mi_get_default_heap(), size, nothrow);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new(mi_heap_t* heap, size_t size) {
+  void* p = mi_heap_malloc(heap,size);
+  if mi_unlikely(p == NULL) return mi_heap_try_new(heap, size, false);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) {
+  return mi_heap_alloc_new(mi_get_default_heap(), size);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict extern inline void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) {
+  size_t total;
+  if mi_unlikely(mi_count_size_overflow(count, size, &total)) {
+    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+    return NULL;
+  }
+  else {
+    return mi_heap_alloc_new(heap,total);
+  }
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) {
+  return mi_heap_alloc_new_n(mi_get_default_heap(), size, count);
+}
+
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_nothrow(size_t size) mi_attr_noexcept {
+  void* p = mi_malloc(size);
+  if mi_unlikely(p == NULL) return mi_try_new(size, true);
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) {
+  void* p;
+  do {
+    p = mi_malloc_aligned(size, alignment);
+  }
+  while(p == NULL && mi_try_new_handler(false));
+  return p;
+}
+
+mi_decl_nodiscard mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept {
+  void* p;
+  do {
+    p = mi_malloc_aligned(size, alignment);
+  }
+  while(p == NULL && mi_try_new_handler(true));
+  return p;
+}
+
+mi_decl_nodiscard void* mi_new_realloc(void* p, size_t newsize) {
+  void* q;
+  do {
+    q = mi_realloc(p, newsize);
+  } while (q == NULL && mi_try_new_handler(false));
+  return q;
+}
+
+mi_decl_nodiscard void* mi_new_reallocn(void* p, size_t newcount, size_t size) {
+  size_t total;
+  if mi_unlikely(mi_count_size_overflow(newcount, size, &total)) {
+    mi_try_new_handler(false);  // on overflow we invoke the try_new_handler once to potentially throw std::bad_alloc
+    return NULL;
+  }
+  else {
+    return mi_new_realloc(p, total);
+  }
+}
+
+// ------------------------------------------------------
+// ensure explicit external inline definitions are emitted!
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+void* _mi_externs[] = {
+  (void*)&_mi_page_malloc,
+  (void*)&_mi_heap_malloc_zero,
+  (void*)&_mi_heap_malloc_zero_ex,
+  (void*)&mi_malloc,
+  (void*)&mi_malloc_small,
+  (void*)&mi_zalloc_small,
+  (void*)&mi_heap_malloc,
+  (void*)&mi_heap_zalloc,
+  (void*)&mi_heap_malloc_small,
+  (void*)&mi_heap_alloc_new,
+  (void*)&mi_heap_alloc_new_n
+};
+#endif
diff --git a/compat/mimalloc/arena.c b/compat/mimalloc/arena.c
new file mode 100644
index 0000000000..232bc05a74
--- /dev/null
+++ b/compat/mimalloc/arena.c
@@ -0,0 +1,536 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+"Arenas" are fixed area's of OS memory from which we can allocate
+large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB).
+In contrast to the rest of mimalloc, the arenas are shared between
+threads and need to be accessed using atomic operations.
+
+Currently arenas are only used to for huge OS page (1GiB) reservations,
+or direct OS memory reservations -- otherwise it delegates to direct allocation from the OS.
+In the future, we can expose an API to manually add more kinds of arenas
+which is sometimes needed for embedded devices or shared memory for example.
+(We can also employ this with WASI or `sbrk` systems to reserve large arenas
+ on demand and be able to reuse them efficiently).
+
+The arena allocation needs to be thread safe and we use an atomic bitmap to allocate.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+#include <errno.h> // ENOMEM
+
+#include "bitmap.h"  // atomic bitmap
+
+
+// os.c
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
+void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
+
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
+void  _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
+
+bool  _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool  _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+
+
+/* -----------------------------------------------------------
+  Arena allocation
+----------------------------------------------------------- */
+
+// Block info: bit 0 contains the `in_use` bit, the upper bits the
+// size in count of arena blocks.
+typedef uintptr_t mi_block_info_t;
+#define MI_ARENA_BLOCK_SIZE   (MI_SEGMENT_SIZE)        // 64MiB  (must be at least MI_SEGMENT_ALIGN)
+#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2)  // 32MiB
+#define MI_MAX_ARENAS         (64)                     // not more than 126 (since we use 7 bits in the memid and an arena index + 1)
+
+// A memory arena descriptor
+typedef struct mi_arena_s {
+  mi_arena_id_t id;                       // arena id; 0 for non-specific
+  bool     exclusive;                     // only allow allocations if specifically for this arena
+  _Atomic(uint8_t*) start;                // the start of the memory area
+  size_t   block_count;                   // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
+  size_t   field_count;                   // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+  int      numa_node;                     // associated NUMA node
+  bool     is_zero_init;                  // is the arena zero initialized?
+  bool     allow_decommit;                // is decommit allowed? if true, is_large should be false and blocks_committed != NULL
+  bool     is_large;                      // large- or huge OS pages (always committed)
+  _Atomic(size_t) search_idx;             // optimization to start the search for free blocks
+  mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
+  mi_bitmap_field_t* blocks_committed;    // are the blocks committed? (can be NULL for memory that cannot be decommitted)
+  mi_bitmap_field_t  blocks_inuse[1];     // in-place bitmap of in-use blocks (of size `field_count`)
+} mi_arena_t;
+
+
+// The available arenas
+static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
+static mi_decl_cache_align _Atomic(size_t)      mi_arena_count; // = 0
+
+
+/* -----------------------------------------------------------
+  Arena id's
+  0 is used for non-arena's (like OS memory)
+  id = arena_index + 1
+----------------------------------------------------------- */
+
+static size_t mi_arena_id_index(mi_arena_id_t id) {
+  return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1);
+}
+
+static mi_arena_id_t mi_arena_id_create(size_t arena_index) {
+  mi_assert_internal(arena_index < MI_MAX_ARENAS);
+  mi_assert_internal(MI_MAX_ARENAS <= 126);
+  int id = (int)arena_index + 1;
+  mi_assert_internal(id >= 1 && id <= 127);
+  return id;
+}
+
+mi_arena_id_t _mi_arena_id_none(void) {
+  return 0;
+}
+
+static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) {
+  return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) ||
+	  (arena_id == req_arena_id));
+}
+
+
+/* -----------------------------------------------------------
+  Arena allocations get a memory id where the lower 8 bits are
+  the arena id, and the upper bits the block index.
+----------------------------------------------------------- */
+
+// Use `0` as a special id for direct OS allocated memory.
+#define MI_MEMID_OS   0
+
+static size_t mi_arena_memid_create(mi_arena_id_t id, bool exclusive, mi_bitmap_index_t bitmap_index) {
+  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
+  mi_assert_internal(id >= 0 && id <= 0x7F);
+  return ((bitmap_index << 8) | ((uint8_t)id & 0x7F) | (exclusive ? 0x80 : 0));
+}
+
+static bool mi_arena_memid_indices(size_t arena_memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
+  *bitmap_index = (arena_memid >> 8);
+  mi_arena_id_t id = (int)(arena_memid & 0x7F);
+  *arena_index = mi_arena_id_index(id);
+  return ((arena_memid & 0x80) != 0);
+}
+
+bool _mi_arena_memid_is_suitable(size_t arena_memid, mi_arena_id_t request_arena_id) {
+  mi_arena_id_t id = (int)(arena_memid & 0x7F);
+  bool exclusive = ((arena_memid & 0x80) != 0);
+  return mi_arena_id_is_suitable(id, exclusive, request_arena_id);
+}
+
+static size_t mi_block_count_of_size(size_t size) {
+  return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+}
+
+/* -----------------------------------------------------------
+  Thread safe allocation in an arena
+----------------------------------------------------------- */
+static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
+{
+  size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx);  // start from last search; ok to be relaxed as the exact start does not matter
+  if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
+    mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx));  // start search from found location next time around
+    return true;
+  };
+  return false;
+}
+
+
+/* -----------------------------------------------------------
+  Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+						   bool* commit, bool* large, bool* is_pinned, bool* is_zero,
+						   mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  MI_UNUSED(arena_index);
+  mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
+  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+
+  mi_bitmap_index_t bitmap_index;
+  if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
+
+  // claimed it! set the dirty bits (todo: no need for an atomic op here?)
+  void* p    = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
+  *memid     = mi_arena_memid_create(arena->id, arena->exclusive, bitmap_index);
+  *is_zero   = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+  *large     = arena->is_large;
+  *is_pinned = (arena->is_large || !arena->allow_decommit);
+  if (arena->blocks_committed == NULL) {
+    // always committed
+    *commit = true;
+  }
+  else if (*commit) {
+    // arena not committed as a whole, but commit requested: ensure commit now
+    bool any_uncommitted;
+    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
+    if (any_uncommitted) {
+      bool commit_zero;
+      _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
+      if (commit_zero) *is_zero = true;
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    *commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+  }
+  return p;
+}
+
+// allocate from an arena with fallback to the OS
+static mi_decl_noinline void* mi_arena_allocate(int numa_node, size_t size, size_t alignment, bool* commit, bool* large,
+						bool* is_pinned, bool* is_zero,
+						mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld )
+{
+  MI_UNUSED_RELEASE(alignment);
+  mi_assert_internal(alignment <= MI_SEGMENT_ALIGN);
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t bcount = mi_block_count_of_size(size);
+  if mi_likely(max_arena == 0) return NULL;
+  mi_assert_internal(size <= bcount * MI_ARENA_BLOCK_SIZE);
+
+  size_t arena_index = mi_arena_id_index(req_arena_id);
+  if (arena_index < MI_MAX_ARENAS) {
+    // try a specific arena if requested
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
+    if ((arena != NULL) &&
+	(arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
+	(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+    {
+      void* p = mi_arena_alloc_from(arena, arena_index, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+      mi_assert_internal((uintptr_t)p % alignment == 0);
+      if (p != NULL) return p;
+    }
+  }
+  else {
+    // try numa affine allocation
+    for (size_t i = 0; i < max_arena; i++) {
+      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+      if (arena == NULL) break; // end reached
+      if ((arena->numa_node < 0 || arena->numa_node == numa_node) && // numa local?
+	  (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+      {
+	void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+	mi_assert_internal((uintptr_t)p % alignment == 0);
+	if (p != NULL) return p;
+      }
+    }
+
+    // try from another numa node instead..
+    for (size_t i = 0; i < max_arena; i++) {
+      mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+      if (arena == NULL) break; // end reached
+      if ((arena->numa_node >= 0 && arena->numa_node != numa_node) && // not numa local!
+	  (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
+      {
+	void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+	mi_assert_internal((uintptr_t)p % alignment == 0);
+	if (p != NULL) return p;
+      }
+    }
+  }
+  return NULL;
+}
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
+			      mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  *memid   = MI_MEMID_OS;
+  *is_zero = false;
+  *is_pinned = false;
+
+  bool default_large = false;
+  if (large == NULL) large = &default_large;   // ensure `large != NULL`
+  const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
+  if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) {
+    void* p = mi_arena_allocate(numa_node, size, alignment, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+    if (p != NULL) return p;
+  }
+
+  // finally, fall back to the OS
+  if (mi_option_is_enabled(mi_option_limit_os_alloc) || req_arena_id != _mi_arena_id_none()) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  *is_zero = true;
+  *memid   = MI_MEMID_OS;
+  void* p = _mi_os_alloc_aligned_offset(size, alignment, align_offset, *commit, large, tld->stats);
+  if (p != NULL) { *is_pinned = *large; }
+  return p;
+}
+
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, large, is_pinned, is_zero, req_arena_id, memid, tld);
+}
+
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+  if (size != NULL) *size = 0;
+  size_t arena_index = mi_arena_id_index(arena_id);
+  if (arena_index >= MI_MAX_ARENAS) return NULL;
+  mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[arena_index]);
+  if (arena == NULL) return NULL;
+  if (size != NULL) *size = arena->block_count * MI_ARENA_BLOCK_SIZE;
+  return arena->start;
+}
+
+/* -----------------------------------------------------------
+  Arena free
+----------------------------------------------------------- */
+
+void _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats) {
+  mi_assert_internal(size > 0 && stats != NULL);
+  if (p==NULL) return;
+  if (size==0) return;
+
+  if (memid == MI_MEMID_OS) {
+    // was a direct OS allocation, pass through
+    _mi_os_free_aligned(p, size, alignment, align_offset, all_committed, stats);
+  }
+  else {
+    // allocated in an arena
+    mi_assert_internal(align_offset == 0);
+    size_t arena_idx;
+    size_t bitmap_idx;
+    mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx);
+    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
+    mi_assert_internal(arena != NULL);
+    const size_t blocks = mi_block_count_of_size(size);
+    // checks
+    if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
+    if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
+      _mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    // potentially decommit
+    if (!arena->allow_decommit || arena->blocks_committed == NULL) {
+      mi_assert_internal(all_committed); // note: may be not true as we may "pretend" to be not committed (in segment.c)
+    }
+    else {
+      mi_assert_internal(arena->blocks_committed != NULL);
+      _mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails
+      _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
+    }
+    // and make it available to others again
+    bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
+    if (!all_inuse) {
+      _mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
+      return;
+    };
+  }
+}
+
+/* -----------------------------------------------------------
+  Add an arena.
+----------------------------------------------------------- */
+
+static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id) {
+  mi_assert_internal(arena != NULL);
+  mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
+  mi_assert_internal(arena->block_count > 0);
+  if (arena_id != NULL) *arena_id = -1;
+
+  size_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
+  if (i >= MI_MAX_ARENAS) {
+    mi_atomic_decrement_acq_rel(&mi_arena_count);
+    return false;
+  }
+  mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
+  arena->id = mi_arena_id_create(i);
+  if (arena_id != NULL) *arena_id = arena->id;
+  return true;
+}
+
+bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+  if (size < MI_ARENA_BLOCK_SIZE) return false;
+
+  if (is_large) {
+    mi_assert_internal(is_committed);
+    is_committed = true;
+  }
+
+  const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
+  const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
+  const size_t bitmaps = (is_committed ? 2 : 3);
+  const size_t asize  = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
+  mi_arena_t* arena   = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
+  if (arena == NULL) return false;
+
+  arena->id = _mi_arena_id_none();
+  arena->exclusive = exclusive;
+  arena->block_count = bcount;
+  arena->field_count = fields;
+  arena->start = (uint8_t*)start;
+  arena->numa_node    = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+  arena->is_large     = is_large;
+  arena->is_zero_init = is_zero;
+  arena->allow_decommit = !is_large && !is_committed; // only allow decommit for initially uncommitted memory
+  arena->search_idx   = 0;
+  arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
+  arena->blocks_committed = (!arena->allow_decommit ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
+  // the bitmaps are already zero initialized due to os_alloc
+  // initialize committed bitmap?
+  if (arena->blocks_committed != NULL && is_committed) {
+    memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
+  }
+  // and claim leftover blocks if needed (so we never allocate there)
+  ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
+  mi_assert_internal(post >= 0);
+  if (post > 0) {
+    // don't use leftover bits at the end
+    mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+    _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
+  }
+
+  return mi_arena_add(arena, arena_id);
+
+}
+
+// Reserve a range of regular OS memory
+int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept
+{
+  if (arena_id != NULL) *arena_id = _mi_arena_id_none();
+  size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block
+  bool large = allow_large;
+  void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
+  if (start==NULL) return ENOMEM;
+  if (!mi_manage_os_memory_ex(start, size, (large || commit), large, true, -1, exclusive, arena_id)) {
+    _mi_os_free_ex(start, size, commit, &_mi_stats_main);
+    _mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
+    return ENOMEM;
+  }
+  _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
+  return 0;
+}
+
+bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept {
+  return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false, NULL);
+}
+
+int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept {
+  return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL);
+}
+
+
+/* -----------------------------------------------------------
+  Debugging
+----------------------------------------------------------- */
+
+static size_t mi_debug_show_bitmap(const char* prefix, mi_bitmap_field_t* fields, size_t field_count ) {
+  size_t inuse_count = 0;
+  for (size_t i = 0; i < field_count; i++) {
+    char buf[MI_BITMAP_FIELD_BITS + 1];
+    uintptr_t field = mi_atomic_load_relaxed(&fields[i]);
+    for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++) {
+      bool inuse = ((((uintptr_t)1 << bit) & field) != 0);
+      if (inuse) inuse_count++;
+      buf[MI_BITMAP_FIELD_BITS - 1 - bit] = (inuse ? 'x' : '.');
+    }
+    buf[MI_BITMAP_FIELD_BITS] = 0;
+    _mi_verbose_message("%s%s\n", prefix, buf);
+  }
+  return inuse_count;
+}
+
+void mi_debug_show_arenas(void) mi_attr_noexcept {
+  size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+  for (size_t i = 0; i < max_arenas; i++) {
+    mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
+    if (arena == NULL) break;
+    size_t inuse_count = 0;
+    _mi_verbose_message("arena %zu: %zu blocks with %zu fields\n", i, arena->block_count, arena->field_count);
+    inuse_count += mi_debug_show_bitmap("  ", arena->blocks_inuse, arena->field_count);
+    _mi_verbose_message("  blocks in use ('x'): %zu\n", inuse_count);
+  }
+}
+
+
+/* -----------------------------------------------------------
+  Reserve a huge page arena.
+----------------------------------------------------------- */
+// reserve at a specific numa node
+int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
+  if (arena_id != NULL) *arena_id = -1;
+  if (pages==0) return 0;
+  if (numa_node < -1) numa_node = -1;
+  if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
+  size_t hsize = 0;
+  size_t pages_reserved = 0;
+  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
+  if (p==NULL || pages_reserved==0) {
+    _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages);
+    return ENOMEM;
+  }
+  _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages);
+
+  if (!mi_manage_os_memory_ex(p, hsize, true, true, true, numa_node, exclusive, arena_id)) {
+    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+    return ENOMEM;
+  }
+  return 0;
+}
+
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
+  return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL);
+}
+
+// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected)
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+  if (pages == 0) return 0;
+
+  // pages per numa node
+  size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
+  if (numa_count <= 0) numa_count = 1;
+  const size_t pages_per = pages / numa_count;
+  const size_t pages_mod = pages % numa_count;
+  const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
+
+  // reserve evenly among numa nodes
+  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+    size_t node_pages = pages_per;  // can be 0
+    if (numa_node < pages_mod) node_pages++;
+    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if (err) return err;
+    if (pages < node_pages) {
+      pages = 0;
+    }
+    else {
+      pages -= node_pages;
+    }
+  }
+
+  return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  MI_UNUSED(max_secs);
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+  return err;
+}
diff --git a/compat/mimalloc/bitmap.c b/compat/mimalloc/bitmap.c
new file mode 100644
index 0000000000..56a8c3057b
--- /dev/null
+++ b/compat/mimalloc/bitmap.c
@@ -0,0 +1,414 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represeted as an array of fields where each field is a machine word (`size_t`)
+
+There are two api's; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+(this is used in region allocation)
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "bitmap.h"
+
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */
+
+// The bit mask for a given number of blocks at a specified bit index.
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) {
+  mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
+  mi_assert_internal(count > 0);
+  if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
+  if (count == 0) return 0;
+  return ((((size_t)1 << count) - 1) << bitidx);
+}
+
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+inline bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
+{
+  mi_assert_internal(bitmap_idx != NULL);
+  mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
+  mi_assert_internal(count > 0);
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t map  = mi_atomic_load_relaxed(field);
+  if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
+
+  // search for 0-bit sequence of length count
+  const size_t mask = mi_bitmap_mask_(count, 0);
+  const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
+
+#ifdef MI_HAVE_FAST_BITSCAN
+  size_t bitidx = mi_ctz(~map);    // quickly find the first zero bit if possible
+#else
+  size_t bitidx = 0;               // otherwise start at 0
+#endif
+  size_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx
+
+  // scan linearly for a free range of zero bits
+  while (bitidx <= bitidx_max) {
+    const size_t mapm = map & m;
+    if (mapm == 0) {  // are the mask bits free at bitidx?
+      mi_assert_internal((m >> bitidx) == mask); // no overflow?
+      const size_t newmap = map | m;
+      mi_assert_internal((newmap^map) >> bitidx == mask);
+      if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) {  // TODO: use strong cas here?
+	// no success, another thread claimed concurrently.. keep going (with updated `map`)
+	continue;
+      }
+      else {
+	// success, we claimed the bits!
+	*bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+	return true;
+      }
+    }
+    else {
+      // on to the next bit range
+#ifdef MI_HAVE_FAST_BITSCAN
+      const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
+      mi_assert_internal(shift > 0 && shift <= count);
+#else
+      const size_t shift = 1;
+#endif
+      bitidx += shift;
+      m <<= shift;
+    }
+  }
+  // no bits found
+  return false;
+}
+
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled
+bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields,
+	    const size_t start_field_idx, const size_t count,
+	    mi_bitmap_pred_fun_t pred_fun, void* pred_arg,
+	    mi_bitmap_index_t* bitmap_idx) {
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+      if (pred_fun == NULL || pred_fun(*bitmap_idx, pred_arg)) {
+	return true;
+      }
+      // predicate returned false, unclaim and look further
+      _mi_bitmap_unclaim(bitmap, bitmap_fields, count, *bitmap_idx);
+    }
+  }
+  return false;
+}
+
+/*
+// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
+bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
+}
+*/
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  // mi_assert_internal((bitmap[idx] & mask) == mask);
+  size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
+  return ((prev & mask) == mask);
+}
+
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
+  size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
+  if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
+  return ((prev & mask) == 0);
+}
+
+// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const size_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields);
+  size_t field = mi_atomic_load_relaxed(&bitmap[idx]);
+  if (any_ones != NULL) *any_ones = ((field & mask) != 0);
+  return ((field & mask) == mask);
+}
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  bool any_ones;
+  mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+  return any_ones;
+}
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Try to atomically claim a sequence of `count` bits starting from the field
+// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
+static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
+{
+  mi_assert_internal(bitmap_idx != NULL);
+
+  // check initial trailing zeros
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t map = mi_atomic_load_relaxed(field);
+  const size_t initial = mi_clz(map);  // count of initial zeros starting at idx
+  mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
+  if (initial == 0)     return false;
+  if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx);     // no need to cross fields
+  if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
+
+  // scan ahead
+  size_t found = initial;
+  size_t mask = 0;     // mask bits for the final field
+  while(found < count) {
+    field++;
+    map = mi_atomic_load_relaxed(field);
+    const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
+    mask = mi_bitmap_mask_(mask_bits, 0);
+    if ((map & mask) != 0) return false;
+    found += mask_bits;
+  }
+  mi_assert_internal(field < &bitmap[bitmap_fields]);
+
+  // found range of zeros up to the final field; mask contains mask in the final field
+  // now claim it atomically
+  mi_bitmap_field_t* const final_field = field;
+  const size_t final_mask = mask;
+  mi_bitmap_field_t* const initial_field = &bitmap[idx];
+  const size_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
+
+  // initial field
+  size_t newmap;
+  field = initial_field;
+  map = mi_atomic_load_relaxed(field);
+  do {
+    newmap = map | initial_mask;
+    if ((map & initial_mask) != 0) { goto rollback; };
+  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+  // intermediate fields
+  while (++field < final_field) {
+    newmap = MI_BITMAP_FIELD_FULL;
+    map = 0;
+    if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
+  }
+
+  // final field
+  mi_assert_internal(field == final_field);
+  map = mi_atomic_load_relaxed(field);
+  do {
+    newmap = map | final_mask;
+    if ((map & final_mask) != 0) { goto rollback; }
+  } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+
+  // claimed!
+  *bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
+  return true;
+
+rollback:
+  // roll back intermediate fields
+  while (--field > initial_field) {
+    newmap = 0;
+    map = MI_BITMAP_FIELD_FULL;
+    mi_assert_internal(mi_atomic_load_relaxed(field) == map);
+    mi_atomic_store_release(field, newmap);
+  }
+  if (field == initial_field) {
+    map = mi_atomic_load_relaxed(field);
+    do {
+      mi_assert_internal((map & initial_mask) == initial_mask);
+      newmap = map & ~initial_mask;
+    } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
+  }
+  // retry? (we make a recursive call instead of goto to be able to use const declarations)
+  if (retries < 4) {
+    return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
+  }
+  else {
+    return false;
+  }
+}
+
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
+  mi_assert_internal(count > 0);
+  if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
+  size_t idx = start_field_idx;
+  for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
+    if (idx >= bitmap_fields) idx = 0; // wrap
+    // try to claim inside the field
+    if (count <= MI_BITMAP_FIELD_BITS) {
+      if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
+	return true;
+      }
+    }
+    // try to claim across fields
+    if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Helper for masks across fields; returns the mid count, post_mask may be 0
+static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) {
+  MI_UNUSED_RELEASE(bitmap_fields);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) {
+    *pre_mask = mi_bitmap_mask_(count, bitidx);
+    *mid_mask = 0;
+    *post_mask = 0;
+    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
+    return 0;
+  }
+  else {
+    const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
+    mi_assert_internal(pre_bits < count);
+    *pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
+    count -= pre_bits;
+    const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
+    *mid_mask = MI_BITMAP_FIELD_FULL;
+    count %= MI_BITMAP_FIELD_BITS;
+    *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
+    mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
+    return mid_count;
+  }
+}
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_one = true;
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
+  if ((prev & pre_mask) != pre_mask) all_one = false;
+  while(mid_count-- > 0) {
+    prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
+    if ((prev & mid_mask) != mid_mask) all_one = false;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_and_acq_rel(field, ~post_mask);
+    if ((prev & post_mask) != post_mask) all_one = false;
+  }
+  return all_one;
+}
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_zero = true;
+  bool any_zero = false;
+  _Atomic(size_t)*field = &bitmap[idx];
+  size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
+  if ((prev & pre_mask) != 0) all_zero = false;
+  if ((prev & pre_mask) != pre_mask) any_zero = true;
+  while (mid_count-- > 0) {
+    prev = mi_atomic_or_acq_rel(field++, mid_mask);
+    if ((prev & mid_mask) != 0) all_zero = false;
+    if ((prev & mid_mask) != mid_mask) any_zero = true;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_or_acq_rel(field, post_mask);
+    if ((prev & post_mask) != 0) all_zero = false;
+    if ((prev & post_mask) != post_mask) any_zero = true;
+  }
+  if (pany_zero != NULL) *pany_zero = any_zero;
+  return all_zero;
+}
+
+
+// Returns `true` if all `count` bits were 1.
+// `any_ones` is `true` if there was at least one bit set to one.
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
+  size_t idx = mi_bitmap_index_field(bitmap_idx);
+  size_t pre_mask;
+  size_t mid_mask;
+  size_t post_mask;
+  size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
+  bool all_ones = true;
+  bool any_ones = false;
+  mi_bitmap_field_t* field = &bitmap[idx];
+  size_t prev = mi_atomic_load_relaxed(field++);
+  if ((prev & pre_mask) != pre_mask) all_ones = false;
+  if ((prev & pre_mask) != 0) any_ones = true;
+  while (mid_count-- > 0) {
+    prev = mi_atomic_load_relaxed(field++);
+    if ((prev & mid_mask) != mid_mask) all_ones = false;
+    if ((prev & mid_mask) != 0) any_ones = true;
+  }
+  if (post_mask!=0) {
+    prev = mi_atomic_load_relaxed(field);
+    if ((prev & post_mask) != post_mask) all_ones = false;
+    if ((prev & post_mask) != 0) any_ones = true;
+  }
+  if (pany_ones != NULL) *pany_ones = any_ones;
+  return all_ones;
+}
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+}
+
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  bool any_ones;
+  mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+  return any_ones;
+}
diff --git a/compat/mimalloc/bitmap.h b/compat/mimalloc/bitmap.h
new file mode 100644
index 0000000000..e92f07503f
--- /dev/null
+++ b/compat/mimalloc/bitmap.h
@@ -0,0 +1,111 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2020 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically,
+represeted as an array of fields where each field is a machine word (`size_t`)
+
+There are two api's; the standard one cannot have sequences that cross
+between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
+(this is used in region allocation)
+
+The `_across` postfixed functions do allow sequences that can cross over
+between the fields. (This is used in arena allocation)
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_BITMAP_H
+#define MI_BITMAP_H
+
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */
+
+#define MI_BITMAP_FIELD_BITS   (8*MI_SIZE_SIZE)
+#define MI_BITMAP_FIELD_FULL   (~((size_t)0))   // all bits set
+
+// An atomic bitmap of `size_t` fields
+typedef _Atomic(size_t)  mi_bitmap_field_t;
+typedef mi_bitmap_field_t*  mi_bitmap_t;
+
+// A bitmap index is the index of the bit in a bitmap.
+typedef size_t mi_bitmap_index_t;
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
+  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
+  return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
+}
+
+// Create a bit index.
+static inline mi_bitmap_index_t mi_bitmap_index_create_from_bit(size_t full_bitidx) {
+  return mi_bitmap_index_create(full_bitidx / MI_BITMAP_FIELD_BITS, full_bitidx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the field index from a bit index.
+static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
+  return (bitmap_idx / MI_BITMAP_FIELD_BITS);
+}
+
+// Get the bit index in a bitmap field
+static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
+  return (bitmap_idx % MI_BITMAP_FIELD_BITS);
+}
+
+// Get the full bit index
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+  return bitmap_idx;
+}
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim a sequence of `count` bits in a single
+// field at `idx` in `bitmap`. Returns `true` on success.
+bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
+bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Like _mi_bitmap_try_find_from_claim but with an extra predicate that must be fullfilled
+typedef bool (mi_cdecl *mi_bitmap_pred_fun_t)(mi_bitmap_index_t bitmap_idx, void* pred_arg);
+bool _mi_bitmap_try_find_from_claim_pred(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_pred_fun_t pred_fun, void* pred_arg, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
+
+bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+
+//--------------------------------------------------------------------------
+// the `_across` functions work on bitmaps where sequences can cross over
+// between the fields. This is used in arena allocation
+//--------------------------------------------------------------------------
+
+// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
+// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
+bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 0 atomically
+// Returns `true` if all `count` bits were 1 previously.
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+// Set `count` bits at `bitmap_idx` to 1 atomically
+// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+
+#endif
diff --git a/compat/mimalloc/heap.c b/compat/mimalloc/heap.c
new file mode 100644
index 0000000000..01d100a0b1
--- /dev/null
+++ b/compat/mimalloc/heap.c
@@ -0,0 +1,602 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset, memcpy
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204)  // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+  Helpers
+----------------------------------------------------------- */
+
+// return `true` if ok, `false` to break
+typedef bool (heap_page_visitor_fun)(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2);
+
+// Visit all pages in a heap; returns `false` if break was called.
+static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void* arg1, void* arg2)
+{
+  if (heap==NULL || heap->page_count==0) return 0;
+
+  // visit all pages
+  #if MI_DEBUG>1
+  size_t total = heap->page_count;
+  #endif
+  size_t count = 0;
+  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+    mi_page_queue_t* pq = &heap->pages[i];
+    mi_page_t* page = pq->first;
+    while(page != NULL) {
+      mi_page_t* next = page->next; // save next in case the page gets removed from the queue
+      mi_assert_internal(mi_page_heap(page) == heap);
+      count++;
+      if (!fn(heap, pq, page, arg1, arg2)) return false;
+      page = next; // and continue
+    }
+  }
+  mi_assert_internal(count == total);
+  return true;
+}
+
+
+#if MI_DEBUG>=2
+static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(pq);
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert_internal(segment->thread_id == heap->thread_id);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  return true;
+}
+#endif
+#if MI_DEBUG>=3
+static bool mi_heap_is_valid(mi_heap_t* heap) {
+  mi_assert_internal(heap!=NULL);
+  mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL);
+  return true;
+}
+#endif
+
+
+
+
+/* -----------------------------------------------------------
+  "Collect" pages by migrating `local_free` and `thread_free`
+  lists and freeing empty pages. This is done when a thread
+  stops (and in that case abandons pages if there are still
+  blocks alive)
+----------------------------------------------------------- */
+
+typedef enum mi_collect_e {
+  MI_NORMAL,
+  MI_FORCE,
+  MI_ABANDON
+} mi_collect_t;
+
+
+static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg_collect, void* arg2 ) {
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  mi_assert_internal(mi_heap_page_is_valid(heap, pq, page, NULL, NULL));
+  mi_collect_t collect = *((mi_collect_t*)arg_collect);
+  _mi_page_free_collect(page, collect >= MI_FORCE);
+  if (mi_page_all_free(page)) {
+    // no more used blocks, free the page.
+    // note: this will free retired pages as well.
+    _mi_page_free(page, pq, collect >= MI_FORCE);
+  }
+  else if (collect == MI_ABANDON) {
+    // still used blocks but the thread is done; abandon the page
+    _mi_page_abandon(page, pq);
+  }
+  return true; // don't break
+}
+
+static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+  return true; // don't break
+}
+
+static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
+{
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+  const bool force = collect >= MI_FORCE;
+  _mi_deferred_free(heap, force);
+
+  // note: never reclaim on collect but leave it to threads that need storage to reclaim
+  const bool force_main =
+    #ifdef NDEBUG
+      collect == MI_FORCE
+    #else
+      collect >= MI_FORCE
+    #endif
+      && _mi_is_main_thread() && mi_heap_is_backing(heap) && !heap->no_reclaim;
+
+  if (force_main) {
+    // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments.
+    // if all memory is freed by now, all segments should be freed.
+    _mi_abandoned_reclaim_all(heap, &heap->tld->segments);
+  }
+
+  // if abandoning, mark all pages to no longer add to delayed_free
+  if (collect == MI_ABANDON) {
+    mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
+  }
+
+  // free all current thread delayed blocks.
+  // (if abandoning, after this there are no more thread-delayed references into the pages.)
+  _mi_heap_delayed_free_all(heap);
+
+  // collect retired pages
+  _mi_heap_collect_retired(heap, force);
+
+  // collect all pages owned by this thread
+  mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
+  mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
+
+  // collect abandoned segments (in particular, decommit expired parts of segments in the abandoned segment list)
+  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+  _mi_abandoned_collect(heap, collect == MI_FORCE /* force? */, &heap->tld->segments);
+
+  // collect segment local caches
+  if (force) {
+    _mi_segment_thread_collect(&heap->tld->segments);
+  }
+
+  // decommit in global segment caches
+  // note: forced decommit can be quite expensive if many threads are created/destroyed so we do not force on abandonment
+  _mi_segment_cache_collect( collect == MI_FORCE, &heap->tld->os);
+
+  // collect regions on program-exit (or shared library unload)
+  if (force && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
+    //_mi_mem_collect(&heap->tld->os);
+  }
+}
+
+void _mi_heap_collect_abandon(mi_heap_t* heap) {
+  mi_heap_collect_ex(heap, MI_ABANDON);
+}
+
+void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept {
+  mi_heap_collect_ex(heap, (force ? MI_FORCE : MI_NORMAL));
+}
+
+void mi_collect(bool force) mi_attr_noexcept {
+  mi_heap_collect(mi_get_default_heap(), force);
+}
+
+
+/* -----------------------------------------------------------
+  Heap new
+----------------------------------------------------------- */
+
+mi_heap_t* mi_heap_get_default(void) {
+  mi_thread_init();
+  return mi_get_default_heap();
+}
+
+mi_heap_t* mi_heap_get_backing(void) {
+  mi_heap_t* heap = mi_heap_get_default();
+  mi_assert_internal(heap!=NULL);
+  mi_heap_t* bheap = heap->tld->heap_backing;
+  mi_assert_internal(bheap!=NULL);
+  mi_assert_internal(bheap->thread_id == _mi_thread_id());
+  return bheap;
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena( mi_arena_id_t arena_id ) {
+  mi_heap_t* bheap = mi_heap_get_backing();
+  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
+  if (heap==NULL) return NULL;
+  _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
+  heap->tld = bheap->tld;
+  heap->thread_id = _mi_thread_id();
+  heap->arena_id = arena_id;
+  _mi_random_split(&bheap->random, &heap->random);
+  heap->cookie  = _mi_heap_random_next(heap) | 1;
+  heap->keys[0] = _mi_heap_random_next(heap);
+  heap->keys[1] = _mi_heap_random_next(heap);
+  heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
+  // push on the thread local heaps list
+  heap->next = heap->tld->heaps;
+  heap->tld->heaps = heap;
+  return heap;
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new(void) {
+  return mi_heap_new_in_arena(_mi_arena_id_none());
+}
+
+bool _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid) {
+  return _mi_arena_memid_is_suitable(memid, heap->arena_id);
+}
+
+uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
+  return _mi_random_next(&heap->random);
+}
+
+// zero out the page queues
+static void mi_heap_reset_pages(mi_heap_t* heap) {
+  mi_assert_internal(heap != NULL);
+  mi_assert_internal(mi_heap_is_initialized(heap));
+  // TODO: copy full empty heap instead?
+  memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
+#ifdef MI_MEDIUM_DIRECT
+  memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
+#endif
+  _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
+  heap->thread_delayed_free = NULL;
+  heap->page_count = 0;
+}
+
+// called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources.
+static void mi_heap_free(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert_internal(mi_heap_is_initialized(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+  if (mi_heap_is_backing(heap)) return; // dont free the backing heap
+
+  // reset default
+  if (mi_heap_is_default(heap)) {
+    _mi_heap_set_default_direct(heap->tld->heap_backing);
+  }
+
+  // remove ourselves from the thread local heaps list
+  // linear search but we expect the number of heaps to be relatively small
+  mi_heap_t* prev = NULL;
+  mi_heap_t* curr = heap->tld->heaps;
+  while (curr != heap && curr != NULL) {
+    prev = curr;
+    curr = curr->next;
+  }
+  mi_assert_internal(curr == heap);
+  if (curr == heap) {
+    if (prev != NULL) { prev->next = heap->next; }
+		 else { heap->tld->heaps = heap->next; }
+  }
+  mi_assert_internal(heap->tld->heaps != NULL);
+
+  // and free the used memory
+  mi_free(heap);
+}
+
+
+/* -----------------------------------------------------------
+  Heap destroy
+----------------------------------------------------------- */
+
+static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
+  MI_UNUSED(arg1);
+  MI_UNUSED(arg2);
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+
+  // ensure no more thread_delayed_free will be added
+  _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false);
+
+  // stats
+  const size_t bsize = mi_page_block_size(page);
+  if (bsize > MI_MEDIUM_OBJ_SIZE_MAX) {
+    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+      mi_heap_stat_decrease(heap, large, bsize);
+    }
+    else {
+      mi_heap_stat_decrease(heap, huge, bsize);
+    }
+  }
+#if (MI_STAT)
+  _mi_page_free_collect(page, false);  // update used count
+  const size_t inuse = page->used;
+  if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+    mi_heap_stat_decrease(heap, normal, bsize * inuse);
+#if (MI_STAT>1)
+    mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
+#endif
+  }
+  mi_heap_stat_decrease(heap, malloc, bsize * inuse);  // todo: off for aligned blocks...
+#endif
+
+  /// pretend it is all free now
+  mi_assert_internal(mi_page_thread_free(page) == NULL);
+  page->used = 0;
+
+  // and free the page
+  // mi_page_free(page,false);
+  page->next = NULL;
+  page->prev = NULL;
+  _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments);
+
+  return true; // keep going
+}
+
+void _mi_heap_destroy_pages(mi_heap_t* heap) {
+  mi_heap_visit_pages(heap, &_mi_heap_page_destroy, NULL, NULL);
+  mi_heap_reset_pages(heap);
+}
+
+void mi_heap_destroy(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  mi_assert(heap->no_reclaim);
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+  if (!heap->no_reclaim) {
+    // don't free in case it may contain reclaimed pages
+    mi_heap_delete(heap);
+  }
+  else {
+    // free all pages
+    _mi_heap_destroy_pages(heap);
+    mi_heap_free(heap);
+  }
+}
+
+void _mi_heap_destroy_all(void) {
+  mi_heap_t* bheap = mi_heap_get_backing();
+  mi_heap_t* curr = bheap->tld->heaps;
+  while (curr != NULL) {
+    mi_heap_t* next = curr->next;
+    if (curr->no_reclaim) {
+      mi_heap_destroy(curr);
+    }
+    else {
+      _mi_heap_destroy_pages(curr);
+    }
+    curr = next;
+  }
+}
+
+/* -----------------------------------------------------------
+  Safe Heap delete
+----------------------------------------------------------- */
+
+// Transfer the pages from one heap to the other
+static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
+  mi_assert_internal(heap!=NULL);
+  if (from==NULL || from->page_count == 0) return;
+
+  // reduce the size of the delayed frees
+  _mi_heap_delayed_free_partial(from);
+
+  // transfer all pages by appending the queues; this will set a new heap field
+  // so threads may do delayed frees in either heap for a while.
+  // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state
+  // so after this only the new heap will get delayed frees
+  for (size_t i = 0; i <= MI_BIN_FULL; i++) {
+    mi_page_queue_t* pq = &heap->pages[i];
+    mi_page_queue_t* append = &from->pages[i];
+    size_t pcount = _mi_page_queue_append(heap, pq, append);
+    heap->page_count += pcount;
+    from->page_count -= pcount;
+  }
+  mi_assert_internal(from->page_count == 0);
+
+  // and do outstanding delayed frees in the `from` heap
+  // note: be careful here as the `heap` field in all those pages no longer point to `from`,
+  // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a
+  // the regular `_mi_free_delayed_block` which is safe.
+  _mi_heap_delayed_free_all(from);
+  #if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
+  #endif
+
+  // and reset the `from` heap
+  mi_heap_reset_pages(from);
+}
+
+// Safe delete a heap without freeing any still allocated blocks in that heap.
+void mi_heap_delete(mi_heap_t* heap)
+{
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return;
+
+  if (!mi_heap_is_backing(heap)) {
+    // tranfer still used pages to the backing heap
+    mi_heap_absorb(heap->tld->heap_backing, heap);
+  }
+  else {
+    // the backing heap abandons its pages
+    _mi_heap_collect_abandon(heap);
+  }
+  mi_assert_internal(heap->page_count==0);
+  mi_heap_free(heap);
+}
+
+mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
+  mi_assert(heap != NULL);
+  mi_assert(mi_heap_is_initialized(heap));
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
+  mi_assert_expensive(mi_heap_is_valid(heap));
+  mi_heap_t* old = mi_get_default_heap();
+  _mi_heap_set_default_direct(heap);
+  return old;
+}
+
+
+
+
+/* -----------------------------------------------------------
+  Analysis
+----------------------------------------------------------- */
+
+// static since it is not thread safe to access heaps from other threads.
+static mi_heap_t* mi_heap_of_block(const void* p) {
+  if (p == NULL) return NULL;
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  bool valid = (_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(valid);
+  if mi_unlikely(!valid) return NULL;
+  return mi_page_heap(_mi_segment_page_of(segment,p));
+}
+
+bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
+  mi_assert(heap != NULL);
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+  return (heap == mi_heap_of_block(p));
+}
+
+
+static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* p, void* vfound) {
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  bool* found = (bool*)vfound;
+  mi_segment_t* segment = _mi_page_segment(page);
+  void* start = _mi_page_start(segment, page, NULL);
+  void* end   = (uint8_t*)start + (page->capacity * mi_page_block_size(page));
+  *found = (p >= start && p < end);
+  return (!*found); // continue if not found
+}
+
+bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
+  mi_assert(heap != NULL);
+  if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
+  if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false;  // only aligned pointers
+  bool found = false;
+  mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
+  return found;
+}
+
+bool mi_check_owned(const void* p) {
+  return mi_heap_check_owned(mi_get_default_heap(), p);
+}
+
+/* -----------------------------------------------------------
+  Visit all heap blocks and areas
+  Todo: enable visiting abandoned pages, and
+	enable visiting all blocks of all heaps across threads
+----------------------------------------------------------- */
+
+// Separate struct to keep `mi_page_t` out of the public interface
+typedef struct mi_heap_area_ex_s {
+  mi_heap_area_t area;
+  mi_page_t*     page;
+} mi_heap_area_ex_t;
+
+static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) {
+  mi_assert(xarea != NULL);
+  if (xarea==NULL) return true;
+  const mi_heap_area_t* area = &xarea->area;
+  mi_page_t* page = xarea->page;
+  mi_assert(page != NULL);
+  if (page == NULL) return true;
+
+  _mi_page_free_collect(page,true);
+  mi_assert_internal(page->local_free == NULL);
+  if (page->used == 0) return true;
+
+  const size_t bsize = mi_page_block_size(page);
+  const size_t ubsize = mi_page_usable_block_size(page); // without padding
+  size_t   psize;
+  uint8_t* pstart = _mi_page_start(_mi_page_segment(page), page, &psize);
+
+  if (page->capacity == 1) {
+    // optimize page with one block
+    mi_assert_internal(page->used == 1 && page->free == NULL);
+    return visitor(mi_page_heap(page), area, pstart, ubsize, arg);
+  }
+
+  // create a bitmap of free blocks.
+  #define MI_MAX_BLOCKS   (MI_SMALL_PAGE_SIZE / sizeof(void*))
+  uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)];
+  memset(free_map, 0, sizeof(free_map));
+
+  size_t free_count = 0;
+  for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+    free_count++;
+    mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize));
+    size_t offset = (uint8_t*)block - pstart;
+    mi_assert_internal(offset % bsize == 0);
+    size_t blockidx = offset / bsize;  // Todo: avoid division?
+    mi_assert_internal( blockidx < MI_MAX_BLOCKS);
+    size_t bitidx = (blockidx / sizeof(uintptr_t));
+    size_t bit = blockidx - (bitidx * sizeof(uintptr_t));
+    free_map[bitidx] |= ((uintptr_t)1 << bit);
+  }
+  mi_assert_internal(page->capacity == (free_count + page->used));
+
+  // walk through all blocks skipping the free ones
+  size_t used_count = 0;
+  for (size_t i = 0; i < page->capacity; i++) {
+    size_t bitidx = (i / sizeof(uintptr_t));
+    size_t bit = i - (bitidx * sizeof(uintptr_t));
+    uintptr_t m = free_map[bitidx];
+    if (bit == 0 && m == UINTPTR_MAX) {
+      i += (sizeof(uintptr_t) - 1); // skip a run of free blocks
+    }
+    else if ((m & ((uintptr_t)1 << bit)) == 0) {
+      used_count++;
+      uint8_t* block = pstart + (i * bsize);
+      if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false;
+    }
+  }
+  mi_assert_internal(page->used == used_count);
+  return true;
+}
+
+typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg);
+
+
+static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) {
+  MI_UNUSED(heap);
+  MI_UNUSED(pq);
+  mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun;
+  mi_heap_area_ex_t xarea;
+  const size_t bsize = mi_page_block_size(page);
+  const size_t ubsize = mi_page_usable_block_size(page);
+  xarea.page = page;
+  xarea.area.reserved = page->reserved * bsize;
+  xarea.area.committed = page->capacity * bsize;
+  xarea.area.blocks = _mi_page_start(_mi_page_segment(page), page, NULL);
+  xarea.area.used = page->used;   // number of blocks in use (#553)
+  xarea.area.block_size = ubsize;
+  xarea.area.full_block_size = bsize;
+  return fun(heap, &xarea, arg);
+}
+
+// Visit all heap pages as areas
+static bool mi_heap_visit_areas(const mi_heap_t* heap, mi_heap_area_visit_fun* visitor, void* arg) {
+  if (visitor == NULL) return false;
+  return mi_heap_visit_pages((mi_heap_t*)heap, &mi_heap_visit_areas_page, (void*)(visitor), arg); // note: function pointer to void* :-{
+}
+
+// Just to pass arguments
+typedef struct mi_visit_blocks_args_s {
+  bool  visit_blocks;
+  mi_block_visit_fun* visitor;
+  void* arg;
+} mi_visit_blocks_args_t;
+
+static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* xarea, void* arg) {
+  mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg;
+  if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false;
+  if (args->visit_blocks) {
+    return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg);
+  }
+  else {
+    return true;
+  }
+}
+
+// Visit all blocks in a heap
+bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) {
+  mi_visit_blocks_args_t args = { visit_blocks, visitor, arg };
+  return mi_heap_visit_areas(heap, &mi_heap_area_visitor, &args);
+}
diff --git a/compat/mimalloc/init.c b/compat/mimalloc/init.c
new file mode 100644
index 0000000000..69f8456fa1
--- /dev/null
+++ b/compat/mimalloc/init.c
@@ -0,0 +1,720 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h>  // memcpy, memset
+#include <stdlib.h>  // atexit
+
+// Empty page used to initialize the small free pages array
+const mi_page_t _mi_page_empty = {
+  0, false, false, false, false,
+  0,       // capacity
+  0,       // reserved capacity
+  { 0 },   // flags
+  false,   // is_zero
+  0,       // retire_expire
+  NULL,    // free
+  0,       // used
+  0,       // xblock_size
+  NULL,    // local_free
+  #if MI_ENCODE_FREELIST
+  { 0, 0 },
+  #endif
+  MI_ATOMIC_VAR_INIT(0), // xthread_free
+  MI_ATOMIC_VAR_INIT(0), // xheap
+  NULL, NULL
+  #if MI_INTPTR_SIZE==8
+  , { 0 }  // padding
+  #endif
+};
+
+#define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
+
+#if (MI_PADDING>0) && (MI_INTPTR_SIZE >= 8)
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#elif (MI_PADDING>0)
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY(), MI_PAGE_EMPTY(), MI_PAGE_EMPTY() }
+#else
+#define MI_SMALL_PAGES_EMPTY  { MI_INIT128(MI_PAGE_EMPTY), MI_PAGE_EMPTY() }
+#endif
+
+
+// Empty page queues for every bin
+#define QNULL(sz)  { NULL, NULL, (sz)*sizeof(uintptr_t) }
+#define MI_PAGE_QUEUES_EMPTY \
+  { QNULL(1), \
+    QNULL(     1), QNULL(     2), QNULL(     3), QNULL(     4), QNULL(     5), QNULL(     6), QNULL(     7), QNULL(     8), /* 8 */ \
+    QNULL(    10), QNULL(    12), QNULL(    14), QNULL(    16), QNULL(    20), QNULL(    24), QNULL(    28), QNULL(    32), /* 16 */ \
+    QNULL(    40), QNULL(    48), QNULL(    56), QNULL(    64), QNULL(    80), QNULL(    96), QNULL(   112), QNULL(   128), /* 24 */ \
+    QNULL(   160), QNULL(   192), QNULL(   224), QNULL(   256), QNULL(   320), QNULL(   384), QNULL(   448), QNULL(   512), /* 32 */ \
+    QNULL(   640), QNULL(   768), QNULL(   896), QNULL(  1024), QNULL(  1280), QNULL(  1536), QNULL(  1792), QNULL(  2048), /* 40 */ \
+    QNULL(  2560), QNULL(  3072), QNULL(  3584), QNULL(  4096), QNULL(  5120), QNULL(  6144), QNULL(  7168), QNULL(  8192), /* 48 */ \
+    QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
+    QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
+    QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \
+    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 1  /* 655360, Huge queue */), \
+    QNULL(MI_MEDIUM_OBJ_WSIZE_MAX + 2) /* Full queue */ }
+
+#define MI_STAT_COUNT_NULL()  {0,0,0,0}
+
+// Empty statistics
+#if MI_STAT>1
+#define MI_STAT_COUNT_END_NULL()  , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) }
+#else
+#define MI_STAT_COUNT_END_NULL()
+#endif
+
+#define MI_STATS_NULL  \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 },     \
+  { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
+  MI_STAT_COUNT_END_NULL()
+
+
+// Empty slice span queues for every bin
+#define SQNULL(sz)  { NULL, NULL, sz }
+#define MI_SEGMENT_SPAN_QUEUES_EMPTY \
+  { SQNULL(1), \
+    SQNULL(     1), SQNULL(     2), SQNULL(     3), SQNULL(     4), SQNULL(     5), SQNULL(     6), SQNULL(     7), SQNULL(    10), /*  8 */ \
+    SQNULL(    12), SQNULL(    14), SQNULL(    16), SQNULL(    20), SQNULL(    24), SQNULL(    28), SQNULL(    32), SQNULL(    40), /* 16 */ \
+    SQNULL(    48), SQNULL(    56), SQNULL(    64), SQNULL(    80), SQNULL(    96), SQNULL(   112), SQNULL(   128), SQNULL(   160), /* 24 */ \
+    SQNULL(   192), SQNULL(   224), SQNULL(   256), SQNULL(   320), SQNULL(   384), SQNULL(   448), SQNULL(   512), SQNULL(   640), /* 32 */ \
+    SQNULL(   768), SQNULL(   896), SQNULL(  1024) /* 35 */ }
+
+
+// --------------------------------------------------------
+// Statically allocate an empty heap as the initial
+// thread local value for the default heap,
+// and statically allocate the backing heap for the main
+// thread so it can function without doing any allocation
+// itself (as accessing a thread local for the first time
+// may lead to allocation itself on some platforms)
+// --------------------------------------------------------
+
+mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
+  NULL,
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY,
+  MI_ATOMIC_VAR_INIT(NULL),
+  0,                // tid
+  0,                // cookie
+  0,                // arena id
+  { 0, 0 },         // keys
+  { {0}, {0}, 0, true }, // random
+  0,                // page count
+  MI_BIN_FULL, 0,   // page retired min/max
+  NULL,             // next
+  false
+};
+
+#define tld_empty_stats  ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
+#define tld_empty_os     ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
+
+mi_decl_cache_align static const mi_tld_t tld_empty = {
+  0,
+  false,
+  NULL, NULL,
+  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, tld_empty_stats, tld_empty_os }, // segments
+  { 0, tld_empty_stats }, // os
+  { MI_STATS_NULL }       // stats
+};
+
+// the thread-local default heap for allocation
+mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
+
+extern mi_heap_t _mi_heap_main;
+
+static mi_tld_t tld_main = {
+  0, false,
+  &_mi_heap_main, & _mi_heap_main,
+  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os }, // segments
+  { 0, &tld_main.stats },  // os
+  { MI_STATS_NULL }       // stats
+};
+
+mi_heap_t _mi_heap_main = {
+  &tld_main,
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY,
+  MI_ATOMIC_VAR_INIT(NULL),
+  0,                // thread id
+  0,                // initial cookie
+  0,                // arena id
+  { 0, 0 },         // the key of the main heap can be fixed (unlike page keys that need to be secure!)
+  { {0x846ca68b}, {0}, 0, true },  // random
+  0,                // page count
+  MI_BIN_FULL, 0,   // page retired min/max
+  NULL,             // next heap
+  false             // can reclaim
+};
+
+bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.
+
+mi_stats_t _mi_stats_main = { MI_STATS_NULL };
+
+
+static void mi_heap_main_init(void) {
+  if (_mi_heap_main.cookie == 0) {
+    _mi_heap_main.thread_id = _mi_thread_id();
+    _mi_heap_main.cookie = 1;
+    #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+      _mi_random_init_weak(&_mi_heap_main.random);    // prevent allocation failure during bcrypt dll initialization with static linking
+    #else
+      _mi_random_init(&_mi_heap_main.random);
+    #endif
+    _mi_heap_main.cookie  = _mi_heap_random_next(&_mi_heap_main);
+    _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main);
+    _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main);
+  }
+}
+
+mi_heap_t* _mi_heap_main_get(void) {
+  mi_heap_main_init();
+  return &_mi_heap_main;
+}
+
+
+/* -----------------------------------------------------------
+  Initialization and freeing of the thread local heaps
+----------------------------------------------------------- */
+
+// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size).
+typedef struct mi_thread_data_s {
+  mi_heap_t  heap;  // must come first due to cast in `_mi_heap_done`
+  mi_tld_t   tld;
+} mi_thread_data_t;
+
+
+// Thread meta-data is allocated directly from the OS. For
+// some programs that do not use thread pools and allocate and
+// destroy many OS threads, this may causes too much overhead
+// per thread so we maintain a small cache of recently freed metadata.
+
+#define TD_CACHE_SIZE (8)
+static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE];
+
+static mi_thread_data_t* mi_thread_data_alloc(void) {
+  // try to find thread metadata in the cache
+  mi_thread_data_t* td;
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td != NULL) {
+      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+      if (td != NULL) {
+	return td;
+      }
+    }
+  }
+  // if that fails, allocate directly from the OS
+  td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+  if (td == NULL) {
+    // if this fails, try once more. (issue #257)
+    td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
+    if (td == NULL) {
+      // really out of memory
+      _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
+    }
+  }
+  return td;
+}
+
+static void mi_thread_data_free( mi_thread_data_t* tdfree ) {
+  // try to add the thread metadata to the cache
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td == NULL) {
+      mi_thread_data_t* expected = NULL;
+      if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) {
+	return;
+      }
+    }
+  }
+  // if that fails, just free it directly
+  _mi_os_free(tdfree, sizeof(mi_thread_data_t), &_mi_stats_main);
+}
+
+static void mi_thread_data_collect(void) {
+  // free all thread metadata from the cache
+  for (int i = 0; i < TD_CACHE_SIZE; i++) {
+    mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]);
+    if (td != NULL) {
+      td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL);
+      if (td != NULL) {
+	_mi_os_free( td, sizeof(mi_thread_data_t), &_mi_stats_main );
+      }
+    }
+  }
+}
+
+// Initialize the thread local default heap, called from `mi_thread_init`
+static bool _mi_heap_init(void) {
+  if (mi_heap_is_initialized(mi_get_default_heap())) return true;
+  if (_mi_is_main_thread()) {
+    // mi_assert_internal(_mi_heap_main.thread_id != 0);  // can happen on freeBSD where alloc is called before any initialization
+    // the main heap is statically allocated
+    mi_heap_main_init();
+    _mi_heap_set_default_direct(&_mi_heap_main);
+    //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
+  }
+  else {
+    // use `_mi_os_alloc` to allocate directly from the OS
+    mi_thread_data_t* td = mi_thread_data_alloc();
+    if (td == NULL) return false;
+
+    // OS allocated so already zero initialized
+    mi_tld_t*  tld = &td->tld;
+    mi_heap_t* heap = &td->heap;
+    _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
+    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
+    heap->thread_id = _mi_thread_id();
+    #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+    _mi_random_init_weak(&heap->random); // match mi_heap_main_init()
+    #else
+    _mi_random_init(&heap->random);
+    #endif
+    heap->cookie  = _mi_heap_random_next(heap) | 1;
+    heap->keys[0] = _mi_heap_random_next(heap);
+    heap->keys[1] = _mi_heap_random_next(heap);
+    heap->tld = tld;
+    tld->heap_backing = heap;
+    tld->heaps = heap;
+    tld->segments.stats = &tld->stats;
+    tld->segments.os = &tld->os;
+    tld->os.stats = &tld->stats;
+    _mi_heap_set_default_direct(heap);
+  }
+  return false;
+}
+
+// Free the thread local default heap (called from `mi_thread_done`)
+static bool _mi_heap_done(mi_heap_t* heap) {
+  if (!mi_heap_is_initialized(heap)) return true;
+
+  // reset default heap
+  _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty);
+
+  // switch to backing heap
+  heap = heap->tld->heap_backing;
+  if (!mi_heap_is_initialized(heap)) return false;
+
+  // delete all non-backing heaps in this thread
+  mi_heap_t* curr = heap->tld->heaps;
+  while (curr != NULL) {
+    mi_heap_t* next = curr->next; // save `next` as `curr` will be freed
+    if (curr != heap) {
+      mi_assert_internal(!mi_heap_is_backing(curr));
+      mi_heap_delete(curr);
+    }
+    curr = next;
+  }
+  mi_assert_internal(heap->tld->heaps == heap && heap->next == NULL);
+  mi_assert_internal(mi_heap_is_backing(heap));
+
+  // collect if not the main thread
+  if (heap != &_mi_heap_main) {
+    _mi_heap_collect_abandon(heap);
+  }
+
+  // merge stats
+  _mi_stats_done(&heap->tld->stats);
+
+  // free if not the main thread
+  if (heap != &_mi_heap_main) {
+    // the following assertion does not always hold for huge segments as those are always treated
+    // as abondened: one may allocate it in one thread, but deallocate in another in which case
+    // the count can be too large or negative. todo: perhaps not count huge segments? see issue #363
+    // mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id());
+    mi_thread_data_free((mi_thread_data_t*)heap);
+  }
+  else {
+    mi_thread_data_collect(); // free cached thread metadata
+    #if 0
+    // never free the main thread even in debug mode; if a dll is linked statically with mimalloc,
+    // there may still be delete/free calls after the mi_fls_done is called. Issue #207
+    _mi_heap_destroy_pages(heap);
+    mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main);
+    #endif
+  }
+  return false;
+}
+
+
+
+// --------------------------------------------------------
+// Try to run `mi_thread_done()` automatically so any memory
+// owned by the thread but not yet released can be abandoned
+// and re-owned by another thread.
+//
+// 1. windows dynamic library:
+//     call from DllMain on DLL_THREAD_DETACH
+// 2. windows static library:
+//     use `FlsAlloc` to call a destructor when the thread is done
+// 3. unix, pthreads:
+//     use a pthread key to call a destructor when a pthread is done
+//
+// In the last two cases we also need to call `mi_process_init`
+// to set up the thread local keys.
+// --------------------------------------------------------
+
+static void _mi_thread_done(mi_heap_t* default_heap);
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+  // nothing to do as it is done in DllMain
+#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+  // use thread local storage keys to detect thread ending
+  #include <windows.h>
+  #include <fibersapi.h>
+  #if (_WIN32_WINNT < 0x600)  // before Windows Vista
+  WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
+  WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex );
+  WINBASEAPI BOOL  WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData );
+  WINBASEAPI BOOL  WINAPI FlsFree(_In_ DWORD dwFlsIndex);
+  #endif
+  static DWORD mi_fls_key = (DWORD)(-1);
+  static void NTAPI mi_fls_done(PVOID value) {
+    mi_heap_t* heap = (mi_heap_t*)value;
+    if (heap != NULL) {
+      _mi_thread_done(heap);
+      FlsSetValue(mi_fls_key, NULL);  // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672
+    }
+  }
+#elif defined(MI_USE_PTHREADS)
+  // use pthread local storage keys to detect thread ending
+  // (and used with MI_TLS_PTHREADS for the default heap)
+  pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1);
+  static void mi_pthread_done(void* value) {
+    if (value!=NULL) _mi_thread_done((mi_heap_t*)value);
+  }
+#elif defined(__wasi__)
+// no pthreads in the WebAssembly Standard Interface
+#else
+  #pragma message("define a way to call mi_thread_done when a thread is done")
+#endif
+
+// Set up handlers so `mi_thread_done` is called automatically
+static void mi_process_setup_auto_thread_done(void) {
+  static bool tls_initialized = false; // fine if it races
+  if (tls_initialized) return;
+  tls_initialized = true;
+  #if defined(_WIN32) && defined(MI_SHARED_LIB)
+    // nothing to do as it is done in DllMain
+  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+    mi_fls_key = FlsAlloc(&mi_fls_done);
+  #elif defined(MI_USE_PTHREADS)
+    mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1));
+    pthread_key_create(&_mi_heap_default_key, &mi_pthread_done);
+  #endif
+  _mi_heap_set_default_direct(&_mi_heap_main);
+}
+
+
+bool _mi_is_main_thread(void) {
+  return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id());
+}
+
+static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1);
+
+size_t  _mi_current_thread_count(void) {
+  return mi_atomic_load_relaxed(&thread_count);
+}
+
+// This is called from the `mi_malloc_generic`
+void mi_thread_init(void) mi_attr_noexcept
+{
+  // ensure our process has started already
+  mi_process_init();
+
+  // initialize the thread local default heap
+  // (this will call `_mi_heap_set_default_direct` and thus set the
+  //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
+  if (_mi_heap_init()) return;  // returns true if already initialized
+
+  _mi_stat_increase(&_mi_stats_main.threads, 1);
+  mi_atomic_increment_relaxed(&thread_count);
+  //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
+}
+
+void mi_thread_done(void) mi_attr_noexcept {
+  _mi_thread_done(mi_get_default_heap());
+}
+
+static void _mi_thread_done(mi_heap_t* heap) {
+  mi_atomic_decrement_relaxed(&thread_count);
+  _mi_stat_decrease(&_mi_stats_main.threads, 1);
+
+  // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
+  if (heap->thread_id != _mi_thread_id()) return;
+
+  // abandon the thread local heap
+  if (_mi_heap_done(heap)) return;  // returns true if already ran
+}
+
+void _mi_heap_set_default_direct(mi_heap_t* heap)  {
+  mi_assert_internal(heap != NULL);
+  #if defined(MI_TLS_SLOT)
+  mi_tls_slot_set(MI_TLS_SLOT,heap);
+  #elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  *mi_tls_pthread_heap_slot() = heap;
+  #elif defined(MI_TLS_PTHREAD)
+  // we use _mi_heap_default_key
+  #else
+  _mi_heap_default = heap;
+  #endif
+
+  // ensure the default heap is passed to `_mi_thread_done`
+  // setting to a non-NULL value also ensures `mi_thread_done` is called.
+  #if defined(_WIN32) && defined(MI_SHARED_LIB)
+    // nothing to do as it is done in DllMain
+  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
+    mi_assert_internal(mi_fls_key != 0);
+    FlsSetValue(mi_fls_key, heap);
+  #elif defined(MI_USE_PTHREADS)
+  if (_mi_heap_default_key != (pthread_key_t)(-1)) {  // can happen during recursive invocation on freeBSD
+    pthread_setspecific(_mi_heap_default_key, heap);
+  }
+  #endif
+}
+
+
+// --------------------------------------------------------
+// Run functions on process init/done, and thread init/done
+// --------------------------------------------------------
+static void mi_cdecl mi_process_done(void);
+
+static bool os_preloading = true;    // true until this module is initialized
+static bool mi_redirected = false;   // true if malloc redirects to mi_malloc
+
+// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
+bool _mi_preloading(void) {
+  return os_preloading;
+}
+
+mi_decl_nodiscard bool mi_is_redirected(void) mi_attr_noexcept {
+  return mi_redirected;
+}
+
+// Communicate with the redirection module on Windows
+#if defined(_WIN32) && defined(MI_SHARED_LIB) && !defined(MI_WIN_NOREDIRECT)
+#ifdef __cplusplus
+extern "C" {
+#endif
+mi_decl_export void _mi_redirect_entry(DWORD reason) {
+  // called on redirection; careful as this may be called before DllMain
+  if (reason == DLL_PROCESS_ATTACH) {
+    mi_redirected = true;
+  }
+  else if (reason == DLL_PROCESS_DETACH) {
+    mi_redirected = false;
+  }
+  else if (reason == DLL_THREAD_DETACH) {
+    mi_thread_done();
+  }
+}
+__declspec(dllimport) bool mi_cdecl mi_allocator_init(const char** message);
+__declspec(dllimport) void mi_cdecl mi_allocator_done(void);
+#ifdef __cplusplus
+}
+#endif
+#else
+static bool mi_allocator_init(const char** message) {
+  if (message != NULL) *message = NULL;
+  return true;
+}
+static void mi_allocator_done(void) {
+  // nothing to do
+}
+#endif
+
+// Called once by the process loader
+static void mi_process_load(void) {
+  mi_heap_main_init();
+  #if defined(MI_TLS_RECURSE_GUARD)
+  volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
+  MI_UNUSED(dummy);
+  #endif
+  os_preloading = false;
+  mi_assert_internal(_mi_is_main_thread());
+  #if !(defined(_WIN32) && defined(MI_SHARED_LIB))  // use Dll process detach (see below) instead of atexit (issue #521)
+  atexit(&mi_process_done);
+  #endif
+  _mi_options_init();
+  mi_process_setup_auto_thread_done();
+  mi_process_init();
+  if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
+
+  // show message from the redirector (if present)
+  const char* msg = NULL;
+  mi_allocator_init(&msg);
+  if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
+    _mi_fputs(NULL,NULL,NULL,msg);
+  }
+
+  // reseed random
+  _mi_random_reinit_if_weak(&_mi_heap_main.random);
+}
+
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
+
+static void mi_detect_cpu_features(void) {
+  // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
+  int32_t cpu_info[4];
+  __cpuid(cpu_info, 7);
+  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
+}
+#else
+static void mi_detect_cpu_features(void) {
+  // nothing
+}
+#endif
+
+// Initialize the process; called by thread_init or the process loader
+void mi_process_init(void) mi_attr_noexcept {
+  // ensure we are called once
+  if (_mi_process_is_initialized) return;
+  _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
+  _mi_process_is_initialized = true;
+  mi_process_setup_auto_thread_done();
+
+  mi_detect_cpu_features();
+  _mi_os_init();
+  mi_heap_main_init();
+  #if (MI_DEBUG)
+  _mi_verbose_message("debug level : %d\n", MI_DEBUG);
+  #endif
+  _mi_verbose_message("secure level: %d\n", MI_SECURE);
+  _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
+  mi_thread_init();
+
+  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+  // When building as a static lib the FLS cleanup happens to early for the main thread.
+  // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup
+  // will not call _mi_thread_done on the (still executing) main thread. See issue #508.
+  FlsSetValue(mi_fls_key, NULL);
+  #endif
+
+  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
+
+  if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    size_t pages = mi_option_get_clamp(mi_option_reserve_huge_os_pages, 0, 128*1024);
+    long reserve_at = mi_option_get(mi_option_reserve_huge_os_pages_at);
+    if (reserve_at != -1) {
+      mi_reserve_huge_os_pages_at(pages, reserve_at, pages*500);
+    } else {
+      mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
+    }
+  }
+  if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
+    long ksize = mi_option_get(mi_option_reserve_os_memory);
+    if (ksize > 0) {
+      mi_reserve_os_memory((size_t)ksize*MI_KiB, true /* commit? */, true /* allow large pages? */);
+    }
+  }
+}
+
+// Called when the process is done (through `at_exit`)
+static void mi_cdecl mi_process_done(void) {
+  // only shutdown if we were initialized
+  if (!_mi_process_is_initialized) return;
+  // ensure we are called once
+  static bool process_done = false;
+  if (process_done) return;
+  process_done = true;
+
+  #if defined(_WIN32) && !defined(MI_SHARED_LIB)
+  FlsFree(mi_fls_key);  // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208
+  #endif
+
+  #ifndef MI_SKIP_COLLECT_ON_EXIT
+    #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)
+    // free all memory if possible on process exit. This is not needed for a stand-alone process
+    // but should be done if mimalloc is statically linked into another shared library which
+    // is repeatedly loaded/unloaded, see issue #281.
+    mi_collect(true /* force */ );
+    #endif
+  #endif
+
+  // Forcefully release all retained memory; this can be dangerous in general if overriding regular malloc/free
+  // since after process_done there might still be other code running that calls `free` (like at_exit routines,
+  // or C-runtime termination code.
+  if (mi_option_is_enabled(mi_option_destroy_on_exit)) {
+    _mi_heap_destroy_all();                          // forcefully release all memory held by all heaps (of this thread only!)
+    _mi_segment_cache_free_all(&_mi_heap_main_get()->tld->os);  // release all cached segments
+  }
+
+  if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
+    mi_stats_print(NULL);
+  }
+  mi_allocator_done();
+  _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id);
+  os_preloading = true; // don't call the C runtime anymore
+}
+
+
+
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
+  // Windows DLL: easy to hook into process_init and thread_done
+  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+    MI_UNUSED(reserved);
+    MI_UNUSED(inst);
+    if (reason==DLL_PROCESS_ATTACH) {
+      mi_process_load();
+    }
+    else if (reason==DLL_PROCESS_DETACH) {
+      mi_process_done();
+    }
+    else if (reason==DLL_THREAD_DETACH) {
+      if (!mi_is_redirected()) {
+	mi_thread_done();
+      }
+    }
+    return TRUE;
+  }
+
+#elif defined(_MSC_VER)
+  // MSVC: use data section magic for static libraries
+  // See <https://www.codeguru.com/cpp/misc/misc/applicationcontrol/article.php/c6945/Running-Code-Before-and-After-Main.htm>
+  static int _mi_process_init(void) {
+    mi_process_load();
+    return 0;
+  }
+  typedef int(*_mi_crt_callback_t)(void);
+  #if defined(_M_X64) || defined(_M_ARM64)
+    __pragma(comment(linker, "/include:" "_mi_msvc_initu"))
+    #pragma section(".CRT$XIU", long, read)
+  #else
+    __pragma(comment(linker, "/include:" "__mi_msvc_initu"))
+  #endif
+  #pragma data_seg(".CRT$XIU")
+  mi_decl_externc _mi_crt_callback_t _mi_msvc_initu[] = { &_mi_process_init };
+  #pragma data_seg()
+
+#elif defined(__cplusplus)
+  // C++: use static initialization to detect process start
+  static bool _mi_process_init(void) {
+    mi_process_load();
+    return (_mi_heap_main.thread_id != 0);
+  }
+  static bool mi_initialized = _mi_process_init();
+
+#elif defined(__GNUC__) || defined(__clang__)
+  // GCC,Clang: use the constructor attribute
+  static void __attribute__((constructor)) _mi_process_init(void) {
+    mi_process_load();
+  }
+
+#else
+#pragma message("define a way to call mi_process_load on your platform")
+#endif
diff --git a/compat/mimalloc/mimalloc-atomic.h b/compat/mimalloc/mimalloc-atomic.h
new file mode 100644
index 0000000000..c66f804933
--- /dev/null
+++ b/compat/mimalloc/mimalloc-atomic.h
@@ -0,0 +1,338 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021 Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_ATOMIC_H
+#define MIMALLOC_ATOMIC_H
+
+// --------------------------------------------------------------------------------------------
+// Atomics
+// We need to be portable between C, C++, and MSVC.
+// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode.
+// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
+// To gain better insight in the range of used atomics, we use explicitly named memory order operations
+// instead of passing the memory order as a parameter.
+// -----------------------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+// Use C++ atomics
+#include <atomic>
+#define  _Atomic(tp)            std::atomic<tp>
+#define  mi_atomic(name)        std::atomic_##name
+#define  mi_memory_order(name)  std::memory_order_##name
+#if !defined(ATOMIC_VAR_INIT) || (__cplusplus >= 202002L) // c++20, see issue #571
+ #define MI_ATOMIC_VAR_INIT(x)  x
+#else
+ #define MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+#elif defined(_MSC_VER)
+// Use MSVC C wrapper for C11 atomics
+#define  _Atomic(tp)            tp
+#define  MI_ATOMIC_VAR_INIT(x)  x
+#define  mi_atomic(name)        mi_atomic_##name
+#define  mi_memory_order(name)  mi_memory_order_##name
+#else
+// Use C11 atomics
+#include <stdatomic.h>
+#define  mi_atomic(name)        atomic_##name
+#define  mi_memory_order(name)  memory_order_##name
+#define  MI_ATOMIC_VAR_INIT(x)  ATOMIC_VAR_INIT(x)
+#endif
+
+// Various defines for all used memory orders in mimalloc
+#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail)  \
+  mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
+
+#define mi_atomic_load_acquire(p)                mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_load_relaxed(p)                mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_store_release(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_store_relaxed(p,x)             mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_exchange_release(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_exchange_acq_rel(p,x)          mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_release(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_weak_acq_rel(p,exp,des)    mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_release(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
+#define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+
+#define mi_atomic_add_relaxed(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_sub_relaxed(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
+#define mi_atomic_add_acq_rel(p,x)               mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_acq_rel(p,x)               mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_acq_rel(p,x)               mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_acq_rel(p,x)                mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
+
+#define mi_atomic_increment_relaxed(p)           mi_atomic_add_relaxed(p,(uintptr_t)1)
+#define mi_atomic_decrement_relaxed(p)           mi_atomic_sub_relaxed(p,(uintptr_t)1)
+#define mi_atomic_increment_acq_rel(p)           mi_atomic_add_acq_rel(p,(uintptr_t)1)
+#define mi_atomic_decrement_acq_rel(p)           mi_atomic_sub_acq_rel(p,(uintptr_t)1)
+
+static inline void mi_atomic_yield(void);
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
+
+
+#if defined(__cplusplus) || !defined(_MSC_VER)
+
+// In C++/C11 atomics we have polymorphic atomics so can use the typed `ptr` variants (where `tp` is the type of atomic value)
+// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
+#define mi_atomic_load_ptr_acquire(tp,p)                mi_atomic_load_acquire(p)
+#define mi_atomic_load_ptr_relaxed(tp,p)                mi_atomic_load_relaxed(p)
+
+// In C++ we need to add casts to help resolve templates if NULL is passed
+#if defined(__cplusplus)
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,(tp*)x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,(tp*)x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,(tp*)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,(tp*)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,(tp*)x)
+#else
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
+#endif
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
+  return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+}
+static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
+  int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
+  while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
+}
+
+// Used by timers
+#define mi_atomic_loadi64_acquire(p)    mi_atomic(load_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)    mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
+
+
+
+#elif defined(_MSC_VER)
+
+// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <intrin.h>
+#ifdef _WIN64
+typedef LONG64   msc_intptr_t;
+#define MI_64(f) f##64
+#else
+typedef LONG     msc_intptr_t;
+#define MI_64(f) f
+#endif
+
+typedef enum mi_memory_order_e {
+  mi_memory_order_relaxed,
+  mi_memory_order_consume,
+  mi_memory_order_acquire,
+  mi_memory_order_release,
+  mi_memory_order_acq_rel,
+  mi_memory_order_seq_cst
+} mi_memory_order;
+
+static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+}
+static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
+}
+static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
+}
+static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  (void)(mo1); (void)(mo2);
+  uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
+  if (read == *expected) {
+    return true;
+  }
+  else {
+    *expected = read;
+    return false;
+  }
+}
+static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
+  return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
+}
+static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
+  (void)(mo);
+  return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
+}
+static inline void mi_atomic_thread_fence(mi_memory_order mo) {
+  (void)(mo);
+  _Atomic(uintptr_t) x = 0;
+  mi_atomic_exchange_explicit(&x, 1, mo);
+}
+static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  return *p;
+#else
+  uintptr_t x = *p;
+  if (mo > mi_memory_order_relaxed) {
+    while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
+  mi_atomic_exchange_explicit(p, x, mo);
+#endif
+}
+static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
+  (void)(mo);
+#if defined(_M_X64)
+  return *p;
+#else
+  int64_t old = *p;
+  int64_t x = old;
+  while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
+    x = old;
+  }
+  return x;
+#endif
+}
+static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
+  (void)(mo);
+#if defined(x_M_IX86) || defined(_M_X64)
+  *p = x;
+#else
+  InterlockedExchange64(p, x);
+#endif
+}
+
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
+#ifdef _WIN64
+  return (int64_t)mi_atomic_addi((int64_t*)p, add);
+#else
+  int64_t current;
+  int64_t sum;
+  do {
+    current = *p;
+    sum = current + add;
+  } while (_InterlockedCompareExchange64(p, sum, current) != current);
+  return current;
+#endif
+}
+static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
+  int64_t current;
+  do {
+    current = *p;
+  } while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
+}
+
+// The pointer macros cast to `uintptr_t`.
+#define mi_atomic_load_ptr_acquire(tp,p)                (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_load_ptr_relaxed(tp,p)                (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
+
+#define mi_atomic_loadi64_acquire(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
+#define mi_atomic_loadi64_relaxed(p)    mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
+#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
+#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))
+
+
+#endif
+
+
+// Atomically add a signed value; returns the previous value.
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
+  return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
+}
+
+// Atomically subtract a signed value; returns the previous value.
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
+  return (intptr_t)mi_atomic_addi(p, -sub);
+}
+
+// Yield
+#if defined(__cplusplus)
+#include <thread>
+static inline void mi_atomic_yield(void) {
+  std::this_thread::yield();
+}
+#elif defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline void mi_atomic_yield(void) {
+  YieldProcessor();
+}
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+static inline void mi_atomic_yield(void) {
+  _mm_pause();
+}
+#elif (defined(__GNUC__) || defined(__clang__)) && \
+      (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
+       defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
+#if defined(__x86_64__) || defined(__i386__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("pause" ::: "memory");
+}
+#elif defined(__aarch64__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("wfe");
+}
+#elif (defined(__arm__) && __ARM_ARCH__ >= 7)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile("yield" ::: "memory");
+}
+#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
+static inline void mi_atomic_yield(void) {
+  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
+}
+#elif defined(__armel__) || defined(__ARMEL__)
+static inline void mi_atomic_yield(void) {
+  __asm__ volatile ("nop" ::: "memory");
+}
+#endif
+#elif defined(__sun)
+// Fallback for other archs
+#include <synch.h>
+static inline void mi_atomic_yield(void) {
+  smt_pause();
+}
+#elif defined(__wasi__)
+#include <sched.h>
+static inline void mi_atomic_yield(void) {
+  sched_yield();
+}
+#else
+#include <unistd.h>
+static inline void mi_atomic_yield(void) {
+  sleep(0);
+}
+#endif
+
+
+#endif // __MIMALLOC_ATOMIC_H
diff --git a/compat/mimalloc/mimalloc-internal.h b/compat/mimalloc/mimalloc-internal.h
new file mode 100644
index 0000000000..60845ae416
--- /dev/null
+++ b/compat/mimalloc/mimalloc-internal.h
@@ -0,0 +1,1116 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_INTERNAL_H
+#define MIMALLOC_INTERNAL_H
+
+#include "mimalloc-types.h"
+#include "mimalloc-track.h"
+
+#if (MI_DEBUG>0)
+#define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
+#else
+#define mi_trace_message(...)
+#endif
+
+#define MI_CACHE_LINE          64
+#if defined(_MSC_VER)
+#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
+#pragma warning(disable:26812)  // unscoped enum warning
+#define mi_decl_noinline        __declspec(noinline)
+#define mi_decl_thread          __declspec(thread)
+#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
+#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
+#define mi_decl_noinline        __attribute__((noinline))
+#define mi_decl_thread          __thread
+#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
+#else
+#define mi_decl_noinline
+#define mi_decl_thread          __thread        // hope for the best :-)
+#define mi_decl_cache_align
+#endif
+
+#if defined(__EMSCRIPTEN__) && !defined(__wasi__)
+#define __wasi__
+#endif
+
+#if defined(__cplusplus)
+#define mi_decl_externc       extern "C"
+#else
+#define mi_decl_externc
+#endif
+
+#if !defined(_WIN32) && !defined(__wasi__)
+#define  MI_USE_PTHREADS
+#include <pthread.h>
+#endif
+
+// "options.c"
+void       _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
+void       _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
+void       _mi_warning_message(const char* fmt, ...);
+void       _mi_verbose_message(const char* fmt, ...);
+void       _mi_trace_message(const char* fmt, ...);
+void       _mi_options_init(void);
+void       _mi_error_message(int err, const char* fmt, ...);
+
+// random.c
+void       _mi_random_init(mi_random_ctx_t* ctx);
+void       _mi_random_init_weak(mi_random_ctx_t* ctx);
+void       _mi_random_reinit_if_weak(mi_random_ctx_t * ctx);
+void       _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
+uintptr_t  _mi_random_next(mi_random_ctx_t* ctx);
+uintptr_t  _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t  _mi_os_random_weak(uintptr_t extra_seed);
+static inline uintptr_t _mi_random_shuffle(uintptr_t x);
+
+// init.c
+extern mi_decl_cache_align mi_stats_t       _mi_stats_main;
+extern mi_decl_cache_align const mi_page_t  _mi_page_empty;
+bool       _mi_is_main_thread(void);
+size_t     _mi_current_thread_count(void);
+bool       _mi_preloading(void);  // true while the C runtime is not ready
+
+// os.c
+size_t     _mi_os_page_size(void);
+void       _mi_os_init(void);                                      // called from process init
+void*      _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocate thread local data
+void       _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data
+
+bool       _mi_os_protect(void* addr, size_t size);
+bool       _mi_os_unprotect(void* addr, size_t size);
+bool       _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats);
+bool       _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
+bool       _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
+// bool       _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t     _mi_os_good_alloc_size(size_t size);
+bool       _mi_os_has_overcommit(void);
+bool       _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats);
+
+void*      _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool* large, mi_stats_t* tld_stats);
+void       _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats);
+
+// arena.c
+void*      _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+void*      _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+void       _mi_arena_free(void* p, size_t size, size_t alignment, size_t align_offset, size_t memid, bool all_committed, mi_stats_t* stats);
+mi_arena_id_t _mi_arena_id_none(void);
+bool       _mi_arena_memid_is_suitable(size_t memid, mi_arena_id_t req_arena_id);
+
+// "segment-cache.c"
+void*      _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld);
+bool       _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld);
+void       _mi_segment_cache_collect(bool force, mi_os_tld_t* tld);
+void       _mi_segment_cache_free_all(mi_os_tld_t* tld);
+void       _mi_segment_map_allocated_at(const mi_segment_t* segment);
+void       _mi_segment_map_freed_at(const mi_segment_t* segment);
+
+// "segment.c"
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
+void       _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
+void       _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
+bool       _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
+void       _mi_segment_thread_collect(mi_segments_tld_t* tld);
+
+#if MI_HUGE_PAGE_ABANDON
+void       _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
+#else
+void       _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block);
+#endif
+
+uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); // page start for any page
+void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
+void       _mi_abandoned_await_readers(void);
+void       _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);
+
+
+
+// "page.c"
+void*      _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment)  mi_attr_noexcept mi_attr_malloc;
+
+void       _mi_page_retire(mi_page_t* page) mi_attr_noexcept;                  // free the page if there are no other pages with many free blocks
+void       _mi_page_unfull(mi_page_t* page);
+void       _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force);   // free the page
+void       _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq);            // abandon the page, to be picked up by another thread...
+void       _mi_heap_delayed_free_all(mi_heap_t* heap);
+bool       _mi_heap_delayed_free_partial(mi_heap_t* heap);
+void       _mi_heap_collect_retired(mi_heap_t* heap, bool force);
+
+void       _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
+bool       _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
+size_t     _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
+void       _mi_deferred_free(mi_heap_t* heap, bool force);
+
+void       _mi_page_free_collect(mi_page_t* page,bool force);
+void       _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page);   // callback from segments
+
+size_t     _mi_bin_size(uint8_t bin);           // for stats
+uint8_t    _mi_bin(size_t size);                // for stats
+
+// "heap.c"
+void       _mi_heap_destroy_pages(mi_heap_t* heap);
+void       _mi_heap_collect_abandon(mi_heap_t* heap);
+void       _mi_heap_set_default_direct(mi_heap_t* heap);
+bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, size_t memid);
+void       _mi_heap_destroy_all(void);
+
+// "stats.c"
+void       _mi_stats_done(mi_stats_t* stats);
+
+mi_msecs_t  _mi_clock_now(void);
+mi_msecs_t  _mi_clock_end(mi_msecs_t start);
+mi_msecs_t  _mi_clock_start(void);
+
+// "alloc.c"
+void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept;  // called from `_mi_malloc_generic`
+void*       _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept;
+void*       _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept;     // called from `_mi_heap_malloc_aligned`
+void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
+mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
+bool        _mi_free_delayed_block(mi_block_t* block);
+void        _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
+
+#if MI_DEBUG>1
+bool        _mi_page_is_valid(mi_page_t* page);
+#endif
+
+
+// ------------------------------------------------------
+// Branches
+// ------------------------------------------------------
+
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x)     (__builtin_expect(!!(x),false))
+#define mi_likely(x)       (__builtin_expect(!!(x),true))
+#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+#define mi_unlikely(x)     (x) [[unlikely]]
+#define mi_likely(x)       (x) [[likely]]
+#else
+#define mi_unlikely(x)     (x)
+#define mi_likely(x)       (x)
+#endif
+
+#ifndef __has_builtin
+#define __has_builtin(x)  0
+#endif
+
+
+/* -----------------------------------------------------------
+  Error codes passed to `_mi_fatal_error`
+  All are recoverable but EFAULT is a serious error and aborts by default in secure mode.
+  For portability define undefined error codes using common Unix codes:
+  <https://www-numi.fnal.gov/offline_software/srt_public_context/WebDocs/Errors/unix_system_errors.html>
+----------------------------------------------------------- */
+#include <errno.h>
+#ifndef EAGAIN         // double free
+#define EAGAIN (11)
+#endif
+#ifndef ENOMEM         // out of memory
+#define ENOMEM (12)
+#endif
+#ifndef EFAULT         // corrupted free-list or meta-data
+#define EFAULT (14)
+#endif
+#ifndef EINVAL         // trying to free an invalid pointer
+#define EINVAL (22)
+#endif
+#ifndef EOVERFLOW      // count*size overflow
+#define EOVERFLOW (75)
+#endif
+
+
+/* -----------------------------------------------------------
+  Inlined definitions
+----------------------------------------------------------- */
+#define MI_UNUSED(x)     (void)(x)
+#if (MI_DEBUG>0)
+#define MI_UNUSED_RELEASE(x)
+#else
+#define MI_UNUSED_RELEASE(x)  MI_UNUSED(x)
+#endif
+
+#define MI_INIT4(x)   x(),x(),x(),x()
+#define MI_INIT8(x)   MI_INIT4(x),MI_INIT4(x)
+#define MI_INIT16(x)  MI_INIT8(x),MI_INIT8(x)
+#define MI_INIT32(x)  MI_INIT16(x),MI_INIT16(x)
+#define MI_INIT64(x)  MI_INIT32(x),MI_INIT32(x)
+#define MI_INIT128(x) MI_INIT64(x),MI_INIT64(x)
+#define MI_INIT256(x) MI_INIT128(x),MI_INIT128(x)
+
+
+// Is `x` a power of two? (0 is considered a power of two)
+static inline bool _mi_is_power_of_two(uintptr_t x) {
+  return ((x & (x - 1)) == 0);
+}
+
+// Is a pointer aligned?
+static inline bool _mi_is_aligned(void* p, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  return (((uintptr_t)p % alignment) == 0);
+}
+
+// Align upwards
+static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) {  // power of two?
+    return ((sz + mask) & ~mask);
+  }
+  else {
+    return (((sz + mask)/alignment)*alignment);
+  }
+}
+
+// Align downwards
+static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
+  uintptr_t mask = alignment - 1;
+  if ((alignment & mask) == 0) { // power of two?
+    return (sz & ~mask);
+  }
+  else {
+    return ((sz / alignment) * alignment);
+  }
+}
+
+// Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  return (divider == 0 ? size : ((size + divider - 1) / divider));
+}
+
+// Is memory zero initialized?
+static inline bool mi_mem_is_zero(void* p, size_t size) {
+  for (size_t i = 0; i < size; i++) {
+    if (((uint8_t*)p)[i] != 0) return false;
+  }
+  return true;
+}
+
+
+// Align a byte size to a size in _machine words_,
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline size_t _mi_wsize_from_size(size_t size) {
+  mi_assert_internal(size <= SIZE_MAX - sizeof(uintptr_t));
+  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
+}
+
+// Overflow detecting multiply
+#if __has_builtin(__builtin_umul_overflow) || (defined(__GNUC__) && (__GNUC__ >= 5))
+#include <limits.h>      // UINT_MAX, ULONG_MAX
+#if defined(_CLOCK_T)    // for Illumos
+#undef _CLOCK_T
+#endif
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #if (SIZE_MAX == ULONG_MAX)
+    return __builtin_umull_overflow(count, size, (unsigned long *)total);
+  #elif (SIZE_MAX == UINT_MAX)
+    return __builtin_umul_overflow(count, size, (unsigned int *)total);
+  #else
+    return __builtin_umulll_overflow(count, size, (unsigned long long *)total);
+  #endif
+}
+#else /* __builtin_umul_overflow is unavailable */
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
+  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
+  *total = count * size;
+  // note: gcc/clang optimize this to directly check the overflow flag
+  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW) && size > 0 && (SIZE_MAX / size) < count);
+}
+#endif
+
+// Safe multiply `count*size` into `total`; return `true` on overflow.
+static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
+  if (count==1) {  // quick check for the case where count is one (common for C++ allocators)
+    *total = size;
+    return false;
+  }
+  else if mi_unlikely(mi_mul_overflow(count, size, total)) {
+    #if MI_DEBUG > 0
+    _mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
+    #endif
+    *total = SIZE_MAX;
+    return true;
+  }
+  else return false;
+}
+
+
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+	   loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323)
+------------------------------------------------------------------------------------------- */
+
+extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
+extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap
+
+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__APPLE__) // macOS
+#define MI_TLS_SLOT               89  // seems unused?
+// #define MI_TLS_RECURSE_GUARD 1
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16)
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)
+// #elif defined(__DragonFly__)
+// #warning "mimalloc is not working correctly on DragonFly yet."
+// #define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#elif defined(__ANDROID__)
+// See issue #381
+#define MI_TLS_PTHREAD
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    mi_heap_t* pheap_main = _mi_heap_main_get();
+    return &pheap_main;
+  }
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+extern pthread_key_t _mi_heap_default_key;
+#endif
+
+// Default heap to allocate from (if not using TLS- or pthread slots).
+// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`).
+// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined.
+// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356).
+extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+
+static inline mi_heap_t* mi_get_default_heap(void) {
+#if defined(MI_TLS_SLOT)
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+  if mi_unlikely(heap == NULL) {
+    #ifdef __GNUC__
+    __asm(""); // prevent conditional load of the address of _mi_heap_empty
+    #endif
+    heap = (mi_heap_t*)&_mi_heap_empty;
+  }
+  return heap;
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
+  return _mi_heap_default;
+#endif
+}
+
+static inline bool mi_heap_is_default(const mi_heap_t* heap) {
+  return (heap == mi_get_default_heap());
+}
+
+static inline bool mi_heap_is_backing(const mi_heap_t* heap) {
+  return (heap->tld->heap_backing == heap);
+}
+
+static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
+  mi_assert_internal(heap != NULL);
+  return (heap != &_mi_heap_empty);
+}
+
+static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
+  return ((uintptr_t)p ^ _mi_heap_main.cookie);
+}
+
+/* -----------------------------------------------------------
+  Pages
+----------------------------------------------------------- */
+
+static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return heap->pages_free_direct[idx];
+}
+
+// Get the page belonging to a certain size class
+static inline mi_page_t* _mi_get_free_small_page(size_t size) {
+  return _mi_heap_get_free_small_page(mi_get_default_heap(), size);
+}
+
+// Segment that contains the pointer
+// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE),
+// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it;
+// therefore we align one byte before `p`.
+static inline mi_segment_t* _mi_ptr_segment(const void* p) {
+  mi_assert_internal(p != NULL);
+  return (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK);
+}
+
+static inline mi_page_t* mi_slice_to_page(mi_slice_t* s) {
+  mi_assert_internal(s->slice_offset== 0 && s->slice_count > 0);
+  return (mi_page_t*)(s);
+}
+
+static inline mi_slice_t* mi_page_to_slice(mi_page_t* p) {
+  mi_assert_internal(p->slice_offset== 0 && p->slice_count > 0);
+  return (mi_slice_t*)(p);
+}
+
+// Segment belonging to a page
+static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
+  mi_segment_t* segment = _mi_ptr_segment(page);
+  mi_assert_internal(segment == NULL || ((mi_slice_t*)page >= segment->slices && (mi_slice_t*)page < segment->slices + segment->slice_entries));
+  return segment;
+}
+
+static inline mi_slice_t* mi_slice_first(const mi_slice_t* slice) {
+  mi_slice_t* start = (mi_slice_t*)((uint8_t*)slice - slice->slice_offset);
+  mi_assert_internal(start >= _mi_ptr_segment(slice)->slices);
+  mi_assert_internal(start->slice_offset == 0);
+  mi_assert_internal(start + start->slice_count > slice);
+  return start;
+}
+
+// Get the page containing the pointer (performance critical as it is called in mi_free)
+static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
+  mi_assert_internal(p > (void*)segment);
+  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
+  mi_assert_internal(diff > 0 && diff <= (ptrdiff_t)MI_SEGMENT_SIZE);
+  size_t idx = (size_t)diff >> MI_SEGMENT_SLICE_SHIFT;
+  mi_assert_internal(idx <= segment->slice_entries);
+  mi_slice_t* slice0 = (mi_slice_t*)&segment->slices[idx];
+  mi_slice_t* slice = mi_slice_first(slice0);  // adjust to the block that holds the page data
+  mi_assert_internal(slice->slice_offset == 0);
+  mi_assert_internal(slice >= segment->slices && slice < segment->slices + segment->slice_entries);
+  return mi_slice_to_page(slice);
+}
+
+// Quick page start for initialized pages
+static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
+  return _mi_segment_page_start(segment, page, page_size);
+}
+
+// Get the page containing the pointer
+static inline mi_page_t* _mi_ptr_page(void* p) {
+  return _mi_segment_page_of(_mi_ptr_segment(p), p);
+}
+
+// Get the block size of a page (special case for huge objects)
+static inline size_t mi_page_block_size(const mi_page_t* page) {
+  const size_t bsize = page->xblock_size;
+  mi_assert_internal(bsize > 0);
+  if mi_likely(bsize < MI_HUGE_BLOCK_SIZE) {
+    return bsize;
+  }
+  else {
+    size_t psize;
+    _mi_segment_page_start(_mi_page_segment(page), page, &psize);
+    return psize;
+  }
+}
+
+static inline bool mi_page_is_huge(const mi_page_t* page) {
+  return (_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+}
+
+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+  return mi_page_block_size(page) - MI_PADDING_SIZE;
+}
+
+// size of a segment
+static inline size_t mi_segment_size(mi_segment_t* segment) {
+  return segment->segment_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static inline uint8_t* mi_segment_end(mi_segment_t* segment) {
+  return (uint8_t*)segment + mi_segment_size(segment);
+}
+
+// Thread free access
+static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
+  return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
+}
+
+static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
+  return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
+}
+
+// Heap access
+static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
+  return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
+}
+
+static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
+  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
+  mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
+}
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
+  return (mi_block_t*)(tf & ~0x03);
+}
+static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
+  return (mi_delayed_t)(tf & 0x03);
+}
+static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
+  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
+}
+static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
+  return mi_tf_make(mi_tf_block(tf),delayed);
+}
+static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
+  return mi_tf_make(block, mi_tf_delayed(tf));
+}
+
+// are all blocks in a page freed?
+// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
+static inline bool mi_page_all_free(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->used == 0);
+}
+
+// are there any available blocks?
+static inline bool mi_page_has_any_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL && page->reserved > 0);
+  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
+}
+
+// are there immediately available blocks, i.e. blocks available on the free list.
+static inline bool mi_page_immediate_available(const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  return (page->free != NULL);
+}
+
+// is more than 7/8th of a page in use?
+static inline bool mi_page_mostly_used(const mi_page_t* page) {
+  if (page==NULL) return true;
+  uint16_t frac = page->reserved / 8U;
+  return (page->reserved - page->used <= frac);
+}
+
+static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) {
+  return &((mi_heap_t*)heap)->pages[_mi_bin(size)];
+}
+
+
+
+//-----------------------------------------------------------
+// Page flags
+//-----------------------------------------------------------
+static inline bool mi_page_is_in_full(const mi_page_t* page) {
+  return page->flags.x.in_full;
+}
+
+static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
+  page->flags.x.in_full = in_full;
+}
+
+static inline bool mi_page_has_aligned(const mi_page_t* page) {
+  return page->flags.x.has_aligned;
+}
+
+static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
+  page->flags.x.has_aligned = has_aligned;
+}
+
+
+/* -------------------------------------------------------------------
+Encoding/Decoding the free list next pointers
+
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
+with a secret key `k1`, as `p^k1`. This prevents overwriting with known
+values but might be still too weak: if the attacker can guess
+the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`).
+Moreover, if multiple blocks can be read as well, the attacker can
+xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
+about the pointers (and subsequently `k1`).
+
+Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
+Since these operations are not associative, the above approaches do not
+work so well any more even if the `p` can be guesstimated. For example,
+for the read case we can subtract two entries to discard the `+k1` term,
+but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
+We include the left-rotation since xor and addition are otherwise linear
+in the lowest bit. Finally, both keys are unique per page which reduces
+the re-use of keys by a large factor.
+
+We also pass a separate `null` value to be used as `NULL` or otherwise
+`(k2<<<k1)+k1` would appear (too) often as a sentinel value.
+------------------------------------------------------------------- */
+
+static inline bool mi_is_in_same_segment(const void* p, const void* q) {
+  return (_mi_ptr_segment(p) == _mi_ptr_segment(q));
+}
+
+static inline bool mi_is_in_same_page(const void* p, const void* q) {
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  if (_mi_ptr_segment(q) != segment) return false;
+  // assume q may be invalid // return (_mi_segment_page_of(segment, p) == _mi_segment_page_of(segment, q));
+  mi_page_t* page = _mi_segment_page_of(segment, p);
+  size_t psize;
+  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+  return (start <= (uint8_t*)q && (uint8_t*)q < start + psize);
+}
+
+static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
+}
+static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
+  shift %= MI_INTPTR_BITS;
+  return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
+}
+
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+  return (p==null ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  uintptr_t x = (uintptr_t)(p==NULL ? null : p);
+  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
+  mi_track_mem_defined(block,sizeof(mi_block_t));
+  mi_block_t* next;
+  #ifdef MI_ENCODE_FREELIST
+  next = (mi_block_t*)mi_ptr_decode(null, block->next, keys);
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  next = (mi_block_t*)block->next;
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+  return next;
+}
+
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
+  mi_track_mem_undefined(block,sizeof(mi_block_t));
+  #ifdef MI_ENCODE_FREELIST
+  block->next = mi_ptr_encode(null, next, keys);
+  #else
+  MI_UNUSED(keys); MI_UNUSED(null);
+  block->next = (mi_encoded_t)next;
+  #endif
+  mi_track_mem_noaccess(block,sizeof(mi_block_t));
+}
+
+static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);
+  // check for free list corruption: is `next` at least in the same page?
+  // TODO: check if `next` is `page->block_size` aligned?
+  if mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next)) {
+    _mi_error_message(EFAULT, "corrupted free list entry of size %zub at %p: value 0x%zx\n", mi_page_block_size(page), block, (uintptr_t)next);
+    next = NULL;
+  }
+  return next;
+  #else
+  MI_UNUSED(page);
+  return mi_block_nextx(page,block,NULL);
+  #endif
+}
+
+static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
+  #ifdef MI_ENCODE_FREELIST
+  mi_block_set_nextx(page,block,next, page->keys);
+  #else
+  MI_UNUSED(page);
+  mi_block_set_nextx(page,block,next,NULL);
+  #endif
+}
+
+
+// -------------------------------------------------------------------
+// commit mask
+// -------------------------------------------------------------------
+
+static inline void mi_commit_mask_create_empty(mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    cm->mask[i] = 0;
+  }
+}
+
+static inline void mi_commit_mask_create_full(mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    cm->mask[i] = ~((size_t)0);
+  }
+}
+
+static inline bool mi_commit_mask_is_empty(const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if (cm->mask[i] != 0) return false;
+  }
+  return true;
+}
+
+static inline bool mi_commit_mask_is_full(const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if (cm->mask[i] != ~((size_t)0)) return false;
+  }
+  return true;
+}
+
+// defined in `segment.c`:
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total);
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx);
+
+#define mi_commit_mask_foreach(cm,idx,count) \
+  idx = 0; \
+  while ((count = _mi_commit_mask_next_run(cm,&idx)) > 0) {
+
+#define mi_commit_mask_foreach_end() \
+    idx += count; \
+  }
+
+
+
+
+// -------------------------------------------------------------------
+// Fast "random" shuffle
+// -------------------------------------------------------------------
+
+static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
+  if (x==0) { x = 17; }   // ensure we don't get stuck in generating zeros
+#if (MI_INTPTR_SIZE==8)
+  // by Sebastiano Vigna, see: <http://xoshiro.di.unimi.it/splitmix64.c>
+  x ^= x >> 30;
+  x *= 0xbf58476d1ce4e5b9UL;
+  x ^= x >> 27;
+  x *= 0x94d049bb133111ebUL;
+  x ^= x >> 31;
+#elif (MI_INTPTR_SIZE==4)
+  // by Chris Wellons, see: <https://nullprogram.com/blog/2018/07/31/>
+  x ^= x >> 16;
+  x *= 0x7feb352dUL;
+  x ^= x >> 15;
+  x *= 0x846ca68bUL;
+  x ^= x >> 16;
+#endif
+  return x;
+}
+
+// -------------------------------------------------------------------
+// Optimize numa node access for the common case (= one node)
+// -------------------------------------------------------------------
+
+int    _mi_os_numa_node_get(mi_os_tld_t* tld);
+size_t _mi_os_numa_node_count_get(void);
+
+extern _Atomic(size_t) _mi_numa_node_count;
+static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
+  if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
+  else return _mi_os_numa_node_get(tld);
+}
+static inline size_t _mi_os_numa_node_count(void) {
+  const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
+  if mi_likely(count > 0) { return count; }
+  else return _mi_os_numa_node_count_get();
+}
+
+
+// -------------------------------------------------------------------
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
+// We only require _mi_threadid() to return a unique id for each thread.
+// -------------------------------------------------------------------
+#if defined(_WIN32)
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  // Windows: works on Intel and ARM in both 32- and 64-bit
+  return (uintptr_t)NtCurrentTeb();
+}
+
+// We use assembly for a fast thread id on the main platforms. The TLS layout depends on
+// both the OS and libc implementation so we use specific tests for each main platform.
+// If you test on another platform and it works please send a PR :-)
+// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register.
+#elif defined(__GNUC__) && ( \
+	   (defined(__GLIBC__)   && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+	|| (defined(__APPLE__)   && (defined(__x86_64__) || defined(__aarch64__))) \
+	|| (defined(__BIONIC__)  && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \
+	|| (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+	|| (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \
+      )
+
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    res = tcb[slot];
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    res = tcb[slot];
+  #endif
+  return res;
+}
+
+// setting a tls slot is only used on macOS for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+  #if defined(__i386__)
+    __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+  #elif defined(__APPLE__) && defined(__x86_64__)
+    __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOS uses GS
+  #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
+    __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x32 ABI
+  #elif defined(__x86_64__)
+    __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+  #elif defined(__arm__)
+    void** tcb; MI_UNUSED(ofs);
+    __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+    tcb[slot] = value;
+  #elif defined(__aarch64__)
+    void** tcb; MI_UNUSED(ofs);
+    #if defined(__APPLE__) // M1, issue #343
+    __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb));
+    #else
+    __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+    #endif
+    tcb[slot] = value;
+  #endif
+}
+
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  #if defined(__BIONIC__)
+    // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id
+    // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86
+    return (uintptr_t)mi_tls_slot(1);
+  #else
+    // in all our other targets, slot 0 is the thread id
+    // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h
+    // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36
+    return (uintptr_t)mi_tls_slot(0);
+  #endif
+}
+
+#else
+
+// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms).
+static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept {
+  return (uintptr_t)&_mi_heap_default;
+}
+
+#endif
+
+
+// -----------------------------------------------------------------------
+// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
+// -----------------------------------------------------------------------
+
+#if defined(__GNUC__)
+
+#include <limits.h>       // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+  return __builtin_clzl(x);
+#else
+  return __builtin_clzll(x);
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (INTPTR_MAX == LONG_MAX)
+  return __builtin_ctzl(x);
+#else
+  return __builtin_ctzll(x);
+#endif
+}
+
+#elif defined(_MSC_VER)
+
+#include <limits.h>       // LONG_MAX
+#define MI_HAVE_FAST_BITSCAN
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+  unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+  _BitScanReverse(&idx, x);
+#else
+  _BitScanReverse64(&idx, x);
+#endif
+  return ((MI_INTPTR_BITS - 1) - idx);
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+  unsigned long idx;
+#if (INTPTR_MAX == LONG_MAX)
+  _BitScanForward(&idx, x);
+#else
+  _BitScanForward64(&idx, x);
+#endif
+  return idx;
+}
+
+#else
+static inline size_t mi_ctz32(uint32_t x) {
+  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+  static const unsigned char debruijn[32] = {
+    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+  };
+  if (x==0) return 32;
+  return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27];
+}
+static inline size_t mi_clz32(uint32_t x) {
+  // de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
+  static const uint8_t debruijn[32] = {
+    31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
+    23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
+  };
+  if (x==0) return 32;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27];
+}
+
+static inline size_t mi_clz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+  return mi_clz32((uint32_t)x);
+#else
+  size_t count = mi_clz32((uint32_t)(x >> 32));
+  if (count < 32) return count;
+  return (32 + mi_clz32((uint32_t)x));
+#endif
+}
+static inline size_t mi_ctz(uintptr_t x) {
+  if (x==0) return MI_INTPTR_BITS;
+#if (MI_INTPTR_BITS <= 32)
+  return mi_ctz32((uint32_t)x);
+#else
+  size_t count = mi_ctz32((uint32_t)x);
+  if (count < 32) return count;
+  return (32 + mi_ctz32((uint32_t)(x>>32)));
+#endif
+}
+
+#endif
+
+// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero)
+static inline size_t mi_bsr(uintptr_t x) {
+  return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x));
+}
+
+
+// ---------------------------------------------------------------------------------
+// Provide our own `_mi_memcpy` for potential performance optimizations.
+//
+// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if
+// we happen to run on x86/x64 cpu's that have "fast short rep movsb" (FSRM) support
+// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253.
+// ---------------------------------------------------------------------------------
+
+#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+#include <string.h>
+extern bool _mi_cpu_has_fsrm;
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  if (_mi_cpu_has_fsrm) {
+    __movsb((unsigned char*)dst, (const unsigned char*)src, n);
+  }
+  else {
+    memcpy(dst, src, n);
+  }
+}
+static inline void _mi_memzero(void* dst, size_t n) {
+  if (_mi_cpu_has_fsrm) {
+    __stosb((unsigned char*)dst, 0, n);
+  }
+  else {
+    memset(dst, 0, n);
+  }
+}
+#else
+#include <string.h>
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  memcpy(dst, src, n);
+}
+static inline void _mi_memzero(void* dst, size_t n) {
+  memset(dst, 0, n);
+}
+#endif
+
+
+// -------------------------------------------------------------------------------
+// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned
+// This is used for example in `mi_realloc`.
+// -------------------------------------------------------------------------------
+
+#if (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)
+// On GCC/CLang we provide a hint that the pointers are word aligned.
+#include <string.h>
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+  const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
+  _mi_memcpy(adst, asrc, n);
+}
+
+static inline void _mi_memzero_aligned(void* dst, size_t n) {
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+  _mi_memzero(adst, n);
+}
+#else
+// Default fallback on `_mi_memcpy`
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  _mi_memcpy(dst, src, n);
+}
+
+static inline void _mi_memzero_aligned(void* dst, size_t n) {
+  mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0);
+  _mi_memzero(dst, n);
+}
+#endif
+
+
+#endif
diff --git a/compat/mimalloc/mimalloc-track.h b/compat/mimalloc/mimalloc-track.h
new file mode 100644
index 0000000000..f60d7acd0b
--- /dev/null
+++ b/compat/mimalloc/mimalloc-track.h
@@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TRACK_H
+#define MIMALLOC_TRACK_H
+
+// ------------------------------------------------------
+// Track memory ranges with macros for tools like Valgrind
+// address sanitizer, or other memory checkers.
+// ------------------------------------------------------
+
+#if MI_VALGRIND
+
+#define MI_TRACK_ENABLED 1
+#define MI_TRACK_TOOL    "valgrind"
+
+#include <valgrind/valgrind.h>
+#include <valgrind/memcheck.h>
+
+#define mi_track_malloc(p,size,zero)        VALGRIND_MALLOCLIKE_BLOCK(p,size,MI_PADDING_SIZE /*red zone*/,zero)
+#define mi_track_resize(p,oldsize,newsize)  VALGRIND_RESIZEINPLACE_BLOCK(p,oldsize,newsize,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_free(p)                    VALGRIND_FREELIKE_BLOCK(p,MI_PADDING_SIZE /*red zone*/)
+#define mi_track_free_size(p,_size)         mi_track_free(p)
+#define mi_track_mem_defined(p,size)        VALGRIND_MAKE_MEM_DEFINED(p,size)
+#define mi_track_mem_undefined(p,size)      VALGRIND_MAKE_MEM_UNDEFINED(p,size)
+#define mi_track_mem_noaccess(p,size)       VALGRIND_MAKE_MEM_NOACCESS(p,size)
+
+#elif MI_ASAN
+
+#define MI_TRACK_ENABLED 1
+#define MI_TRACK_TOOL    "asan"
+
+#include <sanitizer/asan_interface.h>
+
+#define mi_track_malloc(p,size,zero)        ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_resize(p,oldsize,newsize)  ASAN_POISON_MEMORY_REGION(p,oldsize); ASAN_UNPOISON_MEMORY_REGION(p,newsize)
+#define mi_track_free(p)                    ASAN_POISON_MEMORY_REGION(p,mi_usable_size(p))
+#define mi_track_free_size(p,size)          ASAN_POISON_MEMORY_REGION(p,size)
+#define mi_track_mem_defined(p,size)        ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_undefined(p,size)      ASAN_UNPOISON_MEMORY_REGION(p,size)
+#define mi_track_mem_noaccess(p,size)       ASAN_POISON_MEMORY_REGION(p,size)
+
+#else
+
+#define MI_TRACK_ENABLED 0
+#define MI_TRACK_TOOL    "none"
+
+#define mi_track_malloc(p,size,zero)
+#define mi_track_resize(p,oldsize,newsize)
+#define mi_track_free(p)
+#define mi_track_free_size(p,_size)
+#define mi_track_mem_defined(p,size)
+#define mi_track_mem_undefined(p,size)
+#define mi_track_mem_noaccess(p,size)
+
+#endif
+
+#endif
diff --git a/compat/mimalloc/mimalloc-types.h b/compat/mimalloc/mimalloc-types.h
new file mode 100644
index 0000000000..7467945bbc
--- /dev/null
+++ b/compat/mimalloc/mimalloc-types.h
@@ -0,0 +1,609 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_TYPES_H
+#define MIMALLOC_TYPES_H
+
+#include <stddef.h>   // ptrdiff_t
+#include <stdint.h>   // uintptr_t, uint16_t, etc
+#include "mimalloc-atomic.h"  // _Atomic
+
+#ifdef _MSC_VER
+#pragma warning(disable:4214) // bitfield is not int
+#endif
+
+// Minimal alignment necessary. On most platforms 16 bytes are needed
+// due to SSE registers for example. This must be at least `sizeof(void*)`
+#ifndef MI_MAX_ALIGN_SIZE
+#define MI_MAX_ALIGN_SIZE  16   // sizeof(max_align_t)
+#endif
+
+// ------------------------------------------------------
+// Variants
+// ------------------------------------------------------
+
+// Define NDEBUG in the release version to disable assertions.
+// #define NDEBUG
+
+// Define MI_VALGRIND to enable valgrind support
+// #define MI_VALGRIND 1
+
+// Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance).
+// #define MI_STAT 1
+
+// Define MI_SECURE to enable security mitigations
+// #define MI_SECURE 1  // guard page around metadata
+// #define MI_SECURE 2  // guard page around each mimalloc page
+// #define MI_SECURE 3  // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free)
+// #define MI_SECURE 4  // checks for double free. (may be more expensive)
+
+#if !defined(MI_SECURE)
+#define MI_SECURE 0
+#endif
+
+// Define MI_DEBUG for debug mode
+// #define MI_DEBUG 1  // basic assertion checks and statistics, check double free, corrupted free list, and invalid pointer free.
+// #define MI_DEBUG 2  // + internal assertion checks
+// #define MI_DEBUG 3  // + extensive internal invariant checking (cmake -DMI_DEBUG_FULL=ON)
+#if !defined(MI_DEBUG)
+#if !defined(NDEBUG) || defined(_DEBUG)
+#define MI_DEBUG 2
+#else
+#define MI_DEBUG 0
+#endif
+#endif
+
+// Reserve extra padding at the end of each block to be more resilient against heap block overflows.
+// The padding can detect byte-precise buffer overflow on free.
+#if !defined(MI_PADDING) && (MI_DEBUG>=1 || MI_VALGRIND)
+#define MI_PADDING  1
+#endif
+
+
+// Encoded free lists allow detection of corrupted free lists
+// and can detect buffer overflows, modify after free, and double `free`s.
+#if (MI_SECURE>=3 || MI_DEBUG>=1)
+#define MI_ENCODE_FREELIST  1
+#endif
+
+
+// We used to abandon huge pages but to eagerly deallocate if freed from another thread,
+// but that makes it not possible to visit them during a heap walk or include them in a
+// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks if freed from
+// another thread so most memory is available until it gets properly freed by the owning thread.
+// #define MI_HUGE_PAGE_ABANDON 1
+
+
+// ------------------------------------------------------
+// Platform specific values
+// ------------------------------------------------------
+
+// ------------------------------------------------------
+// Size of a pointer.
+// We assume that `sizeof(void*)==sizeof(intptr_t)`
+// and it holds for all platforms we know of.
+//
+// However, the C standard only requires that:
+//  p == (void*)((intptr_t)p))
+// but we also need:
+//  i == (intptr_t)((void*)i)
+// or otherwise one might define an intptr_t type that is larger than a pointer...
+// ------------------------------------------------------
+
+#if INTPTR_MAX > INT64_MAX
+# define MI_INTPTR_SHIFT (4)  // assume 128-bit  (as on arm CHERI for example)
+#elif INTPTR_MAX == INT64_MAX
+# define MI_INTPTR_SHIFT (3)
+#elif INTPTR_MAX == INT32_MAX
+# define MI_INTPTR_SHIFT (2)
+#else
+#error platform pointers must be 32, 64, or 128 bits
+#endif
+
+#if SIZE_MAX == UINT64_MAX
+# define MI_SIZE_SHIFT (3)
+typedef int64_t  mi_ssize_t;
+#elif SIZE_MAX == UINT32_MAX
+# define MI_SIZE_SHIFT (2)
+typedef int32_t  mi_ssize_t;
+#else
+#error platform objects must be 32 or 64 bits
+#endif
+
+#if (SIZE_MAX/2) > LONG_MAX
+# define MI_ZU(x)  x##ULL
+# define MI_ZI(x)  x##LL
+#else
+# define MI_ZU(x)  x##UL
+# define MI_ZI(x)  x##L
+#endif
+
+#define MI_INTPTR_SIZE  (1<<MI_INTPTR_SHIFT)
+#define MI_INTPTR_BITS  (MI_INTPTR_SIZE*8)
+
+#define MI_SIZE_SIZE  (1<<MI_SIZE_SHIFT)
+#define MI_SIZE_BITS  (MI_SIZE_SIZE*8)
+
+#define MI_KiB     (MI_ZU(1024))
+#define MI_MiB     (MI_KiB*MI_KiB)
+#define MI_GiB     (MI_MiB*MI_KiB)
+
+
+// ------------------------------------------------------
+// Main internal data-structures
+// ------------------------------------------------------
+
+// Main tuning parameters for segment and page sizes
+// Sizes for 64-bit (usually divide by two for 32-bit)
+#define MI_SEGMENT_SLICE_SHIFT            (13 + MI_INTPTR_SHIFT)         // 64KiB  (32KiB on 32-bit)
+
+#if MI_INTPTR_SIZE > 4
+#define MI_SEGMENT_SHIFT                  ( 9 + MI_SEGMENT_SLICE_SHIFT)  // 32MiB
+#else
+#define MI_SEGMENT_SHIFT                  ( 7 + MI_SEGMENT_SLICE_SHIFT)  // 4MiB on 32-bit
+#endif
+
+#define MI_SMALL_PAGE_SHIFT               (MI_SEGMENT_SLICE_SHIFT)       // 64KiB
+#define MI_MEDIUM_PAGE_SHIFT              ( 3 + MI_SMALL_PAGE_SHIFT)     // 512KiB
+
+
+// Derived constants
+#define MI_SEGMENT_SIZE                   (MI_ZU(1)<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_ALIGN                  MI_SEGMENT_SIZE
+#define MI_SEGMENT_MASK                   (MI_SEGMENT_ALIGN - 1)
+#define MI_SEGMENT_SLICE_SIZE             (MI_ZU(1)<< MI_SEGMENT_SLICE_SHIFT)
+#define MI_SLICES_PER_SEGMENT             (MI_SEGMENT_SIZE / MI_SEGMENT_SLICE_SIZE) // 1024
+
+#define MI_SMALL_PAGE_SIZE                (MI_ZU(1)<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE               (MI_ZU(1)<<MI_MEDIUM_PAGE_SHIFT)
+
+#define MI_SMALL_OBJ_SIZE_MAX             (MI_SMALL_PAGE_SIZE/4)   // 8KiB on 64-bit
+#define MI_MEDIUM_OBJ_SIZE_MAX            (MI_MEDIUM_PAGE_SIZE/4)  // 128KiB on 64-bit
+#define MI_MEDIUM_OBJ_WSIZE_MAX           (MI_MEDIUM_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+#define MI_LARGE_OBJ_SIZE_MAX             (MI_SEGMENT_SIZE/2)      // 32MiB on 64-bit
+#define MI_LARGE_OBJ_WSIZE_MAX            (MI_LARGE_OBJ_SIZE_MAX/MI_INTPTR_SIZE)
+
+// Maximum number of size classes. (spaced exponentially in 12.5% increments)
+#define MI_BIN_HUGE  (73U)
+
+#if (MI_MEDIUM_OBJ_WSIZE_MAX >= 655360)
+#error "mimalloc internal: define more bins"
+#endif
+
+// Maximum slice offset (15)
+#define MI_MAX_SLICE_OFFSET               ((MI_ALIGNMENT_MAX / MI_SEGMENT_SLICE_SIZE) - 1)
+
+// Used as a special value to encode block sizes in 32 bits.
+#define MI_HUGE_BLOCK_SIZE                ((uint32_t)(2*MI_GiB))
+
+// blocks up to this size are always allocated aligned
+#define MI_MAX_ALIGN_GUARANTEE            (8*MI_MAX_ALIGN_SIZE)
+
+// Alignments over MI_ALIGNMENT_MAX are allocated in dedicated huge page segments
+#define MI_ALIGNMENT_MAX                  (MI_SEGMENT_SIZE >> 1)
+
+
+// ------------------------------------------------------
+// Mimalloc pages contain allocated blocks
+// ------------------------------------------------------
+
+// The free lists use encoded next fields
+// (Only actually encodes when MI_ENCODED_FREELIST is defined.)
+typedef uintptr_t  mi_encoded_t;
+
+// thread id's
+typedef size_t     mi_threadid_t;
+
+// free lists contain blocks
+typedef struct mi_block_s {
+  mi_encoded_t next;
+} mi_block_t;
+
+
+// The delayed flags are used for efficient multi-threaded free-ing
+typedef enum mi_delayed_e {
+  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
+  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
+  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
+  MI_NEVER_DELAYED_FREE = 3  // sticky, only resets on page reclaim
+} mi_delayed_t;
+
+
+// The `in_full` and `has_aligned` page flags are put in a union to efficiently
+// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+#if !MI_TSAN
+typedef union mi_page_flags_s {
+  uint8_t full_aligned;
+  struct {
+    uint8_t in_full : 1;
+    uint8_t has_aligned : 1;
+  } x;
+} mi_page_flags_t;
+#else
+// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
+typedef union mi_page_flags_s {
+  uint16_t full_aligned;
+  struct {
+    uint8_t in_full;
+    uint8_t has_aligned;
+  } x;
+} mi_page_flags_t;
+#endif
+
+// Thread free list.
+// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+typedef uintptr_t mi_thread_free_t;
+
+// A page contains blocks of one specific size (`block_size`).
+// Each page has three list of free blocks:
+// `free` for blocks that can be allocated,
+// `local_free` for freed blocks that are not yet available to `mi_malloc`
+// `thread_free` for freed blocks by other threads
+// The `local_free` and `thread_free` lists are migrated to the `free` list
+// when it is exhausted. The separate `local_free` list is necessary to
+// implement a monotonic heartbeat. The `thread_free` list is needed for
+// avoiding atomic operations in the common case.
+//
+//
+// `used - |thread_free|` == actual blocks that are in use (alive)
+// `used - |thread_free| + |free| + |local_free| == capacity`
+//
+// We don't count `freed` (as |free|) but use `used` to reduce
+// the number of memory accesses in the `mi_page_all_free` function(s).
+//
+// Notes:
+// - Access is optimized for `mi_free` and `mi_page_alloc` (in `alloc.c`)
+// - Using `uint16_t` does not seem to slow things down
+// - The size is 8 words on 64-bit which helps the page index calculations
+//   (and 10 words on 32-bit, and encoded free lists add 2 words. Sizes 10
+//    and 12 are still good for address calculation)
+// - To limit the structure size, the `xblock_size` is 32-bits only; for
+//   blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
+// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
+//   concurrent frees where only the first concurrent free adds to the owning
+//   heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
+//   The invariant is that no-delayed-free is only set if there is
+//   at least one block that will be added, or as already been added, to
+//   the owning heap `thread_delayed_free` list. This guarantees that pages
+//   will be freed correctly even if only other threads free blocks.
+typedef struct mi_page_s {
+  // "owned" by the segment
+  uint32_t              slice_count;       // slices in this page (0 if not a page)
+  uint32_t              slice_offset;      // distance from the actual page data slice (0 if a page)
+  uint8_t               is_reset : 1;      // `true` if the page memory was reset
+  uint8_t               is_committed : 1;  // `true` if the page virtual memory is committed
+  uint8_t               is_zero_init : 1;  // `true` if the page was zero initialized
+
+  // layout like this to optimize access in `mi_malloc` and `mi_free`
+  uint16_t              capacity;          // number of blocks committed, must be the first field, see `segment.c:page_clear`
+  uint16_t              reserved;          // number of blocks reserved in memory
+  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (8 bits)
+  uint8_t               is_zero : 1;       // `true` if the blocks in the free list are zero initialized
+  uint8_t               retire_expire : 7; // expiration count for retired blocks
+
+  mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
+  uint32_t              used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
+  uint32_t              xblock_size;       // size available in each block (always `>0`)
+  mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
+
+  #ifdef MI_ENCODE_FREELIST
+  uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`)
+  #endif
+
+  _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
+  _Atomic(uintptr_t)        xheap;
+
+  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
+  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`
+
+  // 64-bit 9 words, 32-bit 12 words, (+2 for secure)
+  #if MI_INTPTR_SIZE==8
+  uintptr_t padding[1];
+  #endif
+} mi_page_t;
+
+
+
+typedef enum mi_page_kind_e {
+  MI_PAGE_SMALL,    // small blocks go into 64KiB pages inside a segment
+  MI_PAGE_MEDIUM,   // medium blocks go into medium pages inside a segment
+  MI_PAGE_LARGE,    // larger blocks go into a page of just one block
+  MI_PAGE_HUGE,     // huge blocks (> 16 MiB) are put into a single page in a single segment.
+} mi_page_kind_t;
+
+typedef enum mi_segment_kind_e {
+  MI_SEGMENT_NORMAL, // MI_SEGMENT_SIZE size with pages inside.
+  MI_SEGMENT_HUGE,   // > MI_LARGE_SIZE_MAX segment with just one huge page inside.
+} mi_segment_kind_t;
+
+// ------------------------------------------------------
+// A segment holds a commit mask where a bit is set if
+// the corresponding MI_COMMIT_SIZE area is committed.
+// The MI_COMMIT_SIZE must be a multiple of the slice
+// size. If it is equal we have the most fine grained
+// decommit (but setting it higher can be more efficient).
+// The MI_MINIMAL_COMMIT_SIZE is the minimal amount that will
+// be committed in one go which can be set higher than
+// MI_COMMIT_SIZE for efficiency (while the decommit mask
+// is still tracked in fine-grained MI_COMMIT_SIZE chunks)
+// ------------------------------------------------------
+
+#define MI_MINIMAL_COMMIT_SIZE      (16*MI_SEGMENT_SLICE_SIZE)           // 1MiB
+#define MI_COMMIT_SIZE              (MI_SEGMENT_SLICE_SIZE)              // 64KiB
+#define MI_COMMIT_MASK_BITS         (MI_SEGMENT_SIZE / MI_COMMIT_SIZE)
+#define MI_COMMIT_MASK_FIELD_BITS    MI_SIZE_BITS
+#define MI_COMMIT_MASK_FIELD_COUNT  (MI_COMMIT_MASK_BITS / MI_COMMIT_MASK_FIELD_BITS)
+
+#if (MI_COMMIT_MASK_BITS != (MI_COMMIT_MASK_FIELD_COUNT * MI_COMMIT_MASK_FIELD_BITS))
+#error "the segment size must be exactly divisible by the (commit size * size_t bits)"
+#endif
+
+typedef struct mi_commit_mask_s {
+  size_t mask[MI_COMMIT_MASK_FIELD_COUNT];
+} mi_commit_mask_t;
+
+typedef mi_page_t  mi_slice_t;
+typedef int64_t    mi_msecs_t;
+
+
+// Segments are large allocated memory blocks (8mb on 64 bit) from
+// the OS. Inside segments we allocated fixed size _pages_ that
+// contain blocks.
+typedef struct mi_segment_s {
+  size_t            memid;              // memory id for arena allocation
+  bool              mem_is_pinned;      // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)
+  bool              mem_is_large;       // in large/huge os pages?
+  bool              mem_is_committed;   // `true` if the whole segment is eagerly committed
+  size_t            mem_alignment;      // page alignment for huge pages (only used for alignment > MI_ALIGNMENT_MAX)
+  size_t            mem_align_offset;   // offset for huge page alignment (only used for alignment > MI_ALIGNMENT_MAX)
+
+  bool              allow_decommit;
+  mi_msecs_t        decommit_expire;
+  mi_commit_mask_t  decommit_mask;
+  mi_commit_mask_t  commit_mask;
+
+  _Atomic(struct mi_segment_s*) abandoned_next;
+
+  // from here is zero initialized
+  struct mi_segment_s* next;            // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`)
+
+  size_t            abandoned;          // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
+  size_t            abandoned_visits;   // count how often this segment is visited in the abandoned list (to force reclaim it it is too long)
+  size_t            used;               // count of pages in use
+  uintptr_t         cookie;             // verify addresses in debug mode: `mi_ptr_cookie(segment) == segment->cookie`
+
+  size_t            segment_slices;      // for huge segments this may be different from `MI_SLICES_PER_SEGMENT`
+  size_t            segment_info_slices; // initial slices we are using segment info and possible guard pages.
+
+  // layout like this to optimize access in `mi_free`
+  mi_segment_kind_t kind;
+  size_t            slice_entries;       // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT`
+  _Atomic(mi_threadid_t) thread_id;      // unique id of the thread owning this segment
+
+  mi_slice_t        slices[MI_SLICES_PER_SEGMENT+1];  // one more for huge blocks with large alignment
+} mi_segment_t;
+
+
+// ------------------------------------------------------
+// Heaps
+// Provide first-class heaps to allocate from.
+// A heap just owns a set of pages for allocation and
+// can only be allocate/reallocate from the thread that created it.
+// Freeing blocks can be done from any thread though.
+// Per thread, the segments are shared among its heaps.
+// Per thread, there is always a default heap that is
+// used for allocation; it is initialized to statically
+// point to an empty heap to avoid initialization checks
+// in the fast path.
+// ------------------------------------------------------
+
+// Thread local data
+typedef struct mi_tld_s mi_tld_t;
+
+// Pages of a certain block size are held in a queue.
+typedef struct mi_page_queue_s {
+  mi_page_t* first;
+  mi_page_t* last;
+  size_t     block_size;
+} mi_page_queue_t;
+
+#define MI_BIN_FULL  (MI_BIN_HUGE+1)
+
+// Random context
+typedef struct mi_random_cxt_s {
+  uint32_t input[16];
+  uint32_t output[16];
+  int      output_available;
+  bool     weak;
+} mi_random_ctx_t;
+
+
+// In debug mode there is a padding structure at the end of the blocks to check for buffer overflows
+#if (MI_PADDING)
+typedef struct mi_padding_s {
+  uint32_t canary; // encoded block value to check validity of the padding (in case of overflow)
+  uint32_t delta;  // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes)
+} mi_padding_t;
+#define MI_PADDING_SIZE   (sizeof(mi_padding_t))
+#define MI_PADDING_WSIZE  ((MI_PADDING_SIZE + MI_INTPTR_SIZE - 1) / MI_INTPTR_SIZE)
+#else
+#define MI_PADDING_SIZE   0
+#define MI_PADDING_WSIZE  0
+#endif
+
+#define MI_PAGES_DIRECT   (MI_SMALL_WSIZE_MAX + MI_PADDING_WSIZE + 1)
+
+
+// A heap owns a set of pages.
+struct mi_heap_s {
+  mi_tld_t*             tld;
+  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
+  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
+  _Atomic(mi_block_t*)  thread_delayed_free;
+  mi_threadid_t         thread_id;                           // thread this heap belongs too
+  mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)
+  uintptr_t             cookie;                              // random cookie to verify pointers (see `_mi_ptr_cookie`)
+  uintptr_t             keys[2];                             // two random keys used to encode the `thread_delayed_free` list
+  mi_random_ctx_t       random;                              // random number context used for secure allocation
+  size_t                page_count;                          // total number of pages in the `pages` queues.
+  size_t                page_retired_min;                    // smallest retired index (retired pages are fully free, but still in the page queues)
+  size_t                page_retired_max;                    // largest retired index into the `pages` array.
+  mi_heap_t*            next;                                // list of heaps per thread
+  bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
+};
+
+
+
+// ------------------------------------------------------
+// Debug
+// ------------------------------------------------------
+
+#if !defined(MI_DEBUG_UNINIT)
+#define MI_DEBUG_UNINIT     (0xD0)
+#endif
+#if !defined(MI_DEBUG_FREED)
+#define MI_DEBUG_FREED      (0xDF)
+#endif
+#if !defined(MI_DEBUG_PADDING)
+#define MI_DEBUG_PADDING    (0xDE)
+#endif
+
+#if (MI_DEBUG)
+// use our own assertion to print without memory allocation
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func );
+#define mi_assert(expr)     ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__))
+#else
+#define mi_assert(x)
+#endif
+
+#if (MI_DEBUG>1)
+#define mi_assert_internal    mi_assert
+#else
+#define mi_assert_internal(x)
+#endif
+
+#if (MI_DEBUG>2)
+#define mi_assert_expensive   mi_assert
+#else
+#define mi_assert_expensive(x)
+#endif
+
+// ------------------------------------------------------
+// Statistics
+// ------------------------------------------------------
+
+#ifndef MI_STAT
+#if (MI_DEBUG>0)
+#define MI_STAT 2
+#else
+#define MI_STAT 0
+#endif
+#endif
+
+typedef struct mi_stat_count_s {
+  int64_t allocated;
+  int64_t freed;
+  int64_t peak;
+  int64_t current;
+} mi_stat_count_t;
+
+typedef struct mi_stat_counter_s {
+  int64_t total;
+  int64_t count;
+} mi_stat_counter_t;
+
+typedef struct mi_stats_s {
+  mi_stat_count_t segments;
+  mi_stat_count_t pages;
+  mi_stat_count_t reserved;
+  mi_stat_count_t committed;
+  mi_stat_count_t reset;
+  mi_stat_count_t page_committed;
+  mi_stat_count_t segments_abandoned;
+  mi_stat_count_t pages_abandoned;
+  mi_stat_count_t threads;
+  mi_stat_count_t normal;
+  mi_stat_count_t huge;
+  mi_stat_count_t large;
+  mi_stat_count_t malloc;
+  mi_stat_count_t segments_cache;
+  mi_stat_counter_t pages_extended;
+  mi_stat_counter_t mmap_calls;
+  mi_stat_counter_t commit_calls;
+  mi_stat_counter_t page_no_retire;
+  mi_stat_counter_t searches;
+  mi_stat_counter_t normal_count;
+  mi_stat_counter_t huge_count;
+  mi_stat_counter_t large_count;
+#if MI_STAT>1
+  mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
+#endif
+} mi_stats_t;
+
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
+
+#if (MI_STAT)
+#define mi_stat_increase(stat,amount)         _mi_stat_increase( &(stat), amount)
+#define mi_stat_decrease(stat,amount)         _mi_stat_decrease( &(stat), amount)
+#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount)
+#else
+#define mi_stat_increase(stat,amount)         (void)0
+#define mi_stat_decrease(stat,amount)         (void)0
+#define mi_stat_counter_increase(stat,amount) (void)0
+#endif
+
+#define mi_heap_stat_counter_increase(heap,stat,amount)  mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
+#define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)
+
+// ------------------------------------------------------
+// Thread Local data
+// ------------------------------------------------------
+
+// A "span" is is an available range of slices. The span queues keep
+// track of slice spans of at most the given `slice_count` (but more than the previous size class).
+typedef struct mi_span_queue_s {
+  mi_slice_t* first;
+  mi_slice_t* last;
+  size_t      slice_count;
+} mi_span_queue_t;
+
+#define MI_SEGMENT_BIN_MAX (35)     // 35 == mi_segment_bin(MI_SLICES_PER_SEGMENT)
+
+// OS thread local data
+typedef struct mi_os_tld_s {
+  size_t                region_idx;   // start point for next allocation
+  mi_stats_t*           stats;        // points to tld stats
+} mi_os_tld_t;
+
+
+// Segments thread local data
+typedef struct mi_segments_tld_s {
+  mi_span_queue_t     spans[MI_SEGMENT_BIN_MAX+1];  // free slice spans inside segments
+  size_t              count;        // current number of segments;
+  size_t              peak_count;   // peak number of segments
+  size_t              current_size; // current size of all segments
+  size_t              peak_size;    // peak size of all segments
+  mi_stats_t*         stats;        // points to tld stats
+  mi_os_tld_t*        os;           // points to os stats
+} mi_segments_tld_t;
+
+// Thread local data
+struct mi_tld_s {
+  unsigned long long  heartbeat;     // monotonic heartbeat count
+  bool                recurse;       // true if deferred was called; used to prevent infinite recursion.
+  mi_heap_t*          heap_backing;  // backing heap of this thread (cannot be deleted)
+  mi_heap_t*          heaps;         // list of heaps in this thread (so we can abandon all when the thread terminates)
+  mi_segments_tld_t   segments;      // segment tld
+  mi_os_tld_t         os;            // os tld
+  mi_stats_t          stats;         // statistics
+};
+
+#endif
diff --git a/compat/mimalloc/mimalloc.h b/compat/mimalloc/mimalloc.h
new file mode 100644
index 0000000000..d4e96cba4f
--- /dev/null
+++ b/compat/mimalloc/mimalloc.h
@@ -0,0 +1,558 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2022, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#pragma once
+#ifndef MIMALLOC_H
+#define MIMALLOC_H
+
+#define MI_MALLOC_VERSION 209   // major + 2 digits minor
+
+// ------------------------------------------------------
+// Compiler specific attributes
+// ------------------------------------------------------
+
+#ifdef __cplusplus
+  #if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+    #define mi_attr_noexcept   noexcept
+  #else
+    #define mi_attr_noexcept   throw()
+  #endif
+#else
+  #define mi_attr_noexcept
+#endif
+
+#if defined(__cplusplus) && (__cplusplus >= 201703)
+  #define mi_decl_nodiscard    [[nodiscard]]
+#elif (defined(__GNUC__) && (__GNUC__ >= 4)) || defined(__clang__)  // includes clang, icc, and clang-cl
+  #define mi_decl_nodiscard    __attribute__((warn_unused_result))
+#elif defined(_HAS_NODISCARD)
+  #define mi_decl_nodiscard    _NODISCARD
+#elif (_MSC_VER >= 1700)
+  #define mi_decl_nodiscard    _Check_return_
+#else
+  #define mi_decl_nodiscard
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  #if !defined(MI_SHARED_LIB)
+    #define mi_decl_export
+  #elif defined(MI_SHARED_LIB_EXPORT)
+    #define mi_decl_export              __declspec(dllexport)
+  #else
+    #define mi_decl_export              __declspec(dllimport)
+  #endif
+  #if defined(__MINGW32__)
+    #define mi_decl_restrict
+    #define mi_attr_malloc              __attribute__((malloc))
+  #else
+    #if (_MSC_VER >= 1900) && !defined(__EDG__)
+      #define mi_decl_restrict          __declspec(allocator) __declspec(restrict)
+    #else
+      #define mi_decl_restrict          __declspec(restrict)
+    #endif
+    #define mi_attr_malloc
+  #endif
+  #define mi_cdecl                      __cdecl
+  #define mi_attr_alloc_size(s)
+  #define mi_attr_alloc_size2(s1,s2)
+  #define mi_attr_alloc_align(p)
+#elif defined(__GNUC__)                 // includes clang and icc
+  #if defined(MI_SHARED_LIB) && defined(MI_SHARED_LIB_EXPORT)
+    #define mi_decl_export              __attribute__((visibility("default")))
+  #else
+    #define mi_decl_export
+  #endif
+  #define mi_cdecl                      // leads to warnings... __attribute__((cdecl))
+  #define mi_decl_restrict
+  #define mi_attr_malloc                __attribute__((malloc))
+  #if (defined(__clang_major__) && (__clang_major__ < 4)) || (__GNUC__ < 5)
+    #define mi_attr_alloc_size(s)
+    #define mi_attr_alloc_size2(s1,s2)
+    #define mi_attr_alloc_align(p)
+  #elif defined(__INTEL_COMPILER)
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)
+  #else
+    #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
+    #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
+    #define mi_attr_alloc_align(p)      __attribute__((alloc_align(p)))
+  #endif
+#else
+  #define mi_cdecl
+  #define mi_decl_export
+  #define mi_decl_restrict
+  #define mi_attr_malloc
+  #define mi_attr_alloc_size(s)
+  #define mi_attr_alloc_size2(s1,s2)
+  #define mi_attr_alloc_align(p)
+#endif
+
+// ------------------------------------------------------
+// Includes
+// ------------------------------------------------------
+
+#include "git-compat-util.h"
+
+#include <stdbool.h>    // bool
+#include <stdint.h>     // INTPTR_MAX
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------------
+// Standard malloc interface
+// ------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc(size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc(size_t count, size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc(void* p, size_t newsize)      mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_export void* mi_expand(void* p, size_t newsize)                         mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_export void mi_free(void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+// ------------------------------------------------------
+// Extended functionality
+// ------------------------------------------------------
+#define MI_SMALL_WSIZE_MAX  (128)
+#define MI_SMALL_SIZE_MAX   (MI_SMALL_WSIZE_MAX*sizeof(void*))
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc(size_t size)       mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_reallocn(void* p, size_t count, size_t size)        mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export void* mi_reallocf(void* p, size_t newsize)                   mi_attr_noexcept mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export size_t mi_usable_size(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size)     mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Internals
+// ------------------------------------------------------
+
+typedef void (mi_cdecl mi_deferred_free_fun)(bool force, unsigned long long heartbeat, void* arg);
+mi_decl_export void mi_register_deferred_free(mi_deferred_free_fun* deferred_free, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_output_fun)(const char* msg, void* arg);
+mi_decl_export void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+typedef void (mi_cdecl mi_error_fun)(int err, void* arg);
+mi_decl_export void mi_register_error(mi_error_fun* fun, void* arg);
+
+mi_decl_export void mi_collect(bool force)    mi_attr_noexcept;
+mi_decl_export int  mi_version(void)          mi_attr_noexcept;
+mi_decl_export void mi_stats_reset(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_merge(void)      mi_attr_noexcept;
+mi_decl_export void mi_stats_print(void* out) mi_attr_noexcept;  // backward compatibility: `out` is ignored and should be NULL
+mi_decl_export void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_init(void)     mi_attr_noexcept;
+mi_decl_export void mi_thread_init(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_done(void)      mi_attr_noexcept;
+mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
+				    size_t* current_rss, size_t* peak_rss,
+				    size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
+
+// -------------------------------------------------------------------------------------
+// Aligned allocation
+// Note that `alignment` always follows `size` for consistency with unaligned
+// allocation, but unfortunately this differs from `posix_memalign` and `aligned_alloc`.
+// -------------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(1,2);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+
+
+// -------------------------------------------------------------------------------------
+// Heaps: first-class, but can only allocate from the same thread that created it.
+// -------------------------------------------------------------------------------------
+
+struct mi_heap_s;
+typedef struct mi_heap_s mi_heap_t;
+
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new(void);
+mi_decl_export void       mi_heap_delete(mi_heap_t* heap);
+mi_decl_export void       mi_heap_destroy(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_set_default(mi_heap_t* heap);
+mi_decl_export mi_heap_t* mi_heap_get_default(void);
+mi_decl_export mi_heap_t* mi_heap_get_backing(void);
+mi_decl_export void       mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, size_t count, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize)              mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize)             mi_attr_noexcept mi_attr_alloc_size(3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s)            mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const char* s, size_t n) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept mi_attr_malloc;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned(mi_heap_t* heap, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_zalloc_aligned_at(mi_heap_t* heap, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, size_t count, size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_calloc_aligned_at(mi_heap_t* heap, size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size2(2, 3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_realloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+
+
+// --------------------------------------------------------------------------------
+// Zero initialized re-allocation.
+// Only valid on memory that was originally allocated with zero initialization too.
+// e.g. `mi_calloc`, `mi_zalloc`, `mi_zalloc_aligned` etc.
+// see <https://github.com/microsoft/mimalloc/issues/63#issuecomment-508272992>
+// --------------------------------------------------------------------------------
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc(void* p, size_t newsize)                mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc(void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(2) mi_attr_alloc_align(3);
+mi_decl_nodiscard mi_decl_export void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(2,3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc(mi_heap_t* heap, void* p, size_t newsize)                mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t newcount, size_t size)  mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned(mi_heap_t* heap, void* p, size_t newsize, size_t alignment) mi_attr_noexcept mi_attr_alloc_size(3) mi_attr_alloc_align(4);
+mi_decl_nodiscard mi_decl_export void* mi_heap_rezalloc_aligned_at(mi_heap_t* heap, void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size(3);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept mi_attr_alloc_size2(3,4) mi_attr_alloc_align(5);
+mi_decl_nodiscard mi_decl_export void* mi_heap_recalloc_aligned_at(mi_heap_t* heap, void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept mi_attr_alloc_size2(3,4);
+
+
+// ------------------------------------------------------
+// Analysis
+// ------------------------------------------------------
+
+mi_decl_export bool mi_heap_contains_block(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_heap_check_owned(mi_heap_t* heap, const void* p);
+mi_decl_export bool mi_check_owned(const void* p);
+
+// An area of heap space contains blocks of a single size.
+typedef struct mi_heap_area_s {
+  void*  blocks;      // start of the area containing heap blocks
+  size_t reserved;    // bytes reserved for this area (virtual)
+  size_t committed;   // current available bytes for this area
+  size_t used;        // number of allocated blocks
+  size_t block_size;  // size in bytes of each block
+  size_t full_block_size; // size in bytes of a full block including padding and metadata.
+} mi_heap_area_t;
+
+typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg);
+
+mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg);
+
+// Experimental
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
+
+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+
+mi_decl_export int  mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
+mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
+
+mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
+
+// Experimental: heaps associated with specific memory arena's
+typedef int mi_arena_id_t;
+mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
+mi_decl_export int   mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export int   mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+mi_decl_export bool  mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
+
+#if MI_MALLOC_VERSION >= 200
+// Create a heap that only allocates in the specified arena
+mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id);
+#endif
+
+// deprecated
+mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+
+
+// ------------------------------------------------------
+// Convenience
+// ------------------------------------------------------
+
+#define mi_malloc_tp(tp)                ((tp*)mi_malloc(sizeof(tp)))
+#define mi_zalloc_tp(tp)                ((tp*)mi_zalloc(sizeof(tp)))
+#define mi_calloc_tp(tp,n)              ((tp*)mi_calloc(n,sizeof(tp)))
+#define mi_mallocn_tp(tp,n)             ((tp*)mi_mallocn(n,sizeof(tp)))
+#define mi_reallocn_tp(p,tp,n)          ((tp*)mi_reallocn(p,n,sizeof(tp)))
+#define mi_recalloc_tp(p,tp,n)          ((tp*)mi_recalloc(p,n,sizeof(tp)))
+
+#define mi_heap_malloc_tp(hp,tp)        ((tp*)mi_heap_malloc(hp,sizeof(tp)))
+#define mi_heap_zalloc_tp(hp,tp)        ((tp*)mi_heap_zalloc(hp,sizeof(tp)))
+#define mi_heap_calloc_tp(hp,tp,n)      ((tp*)mi_heap_calloc(hp,n,sizeof(tp)))
+#define mi_heap_mallocn_tp(hp,tp,n)     ((tp*)mi_heap_mallocn(hp,n,sizeof(tp)))
+#define mi_heap_reallocn_tp(hp,p,tp,n)  ((tp*)mi_heap_reallocn(hp,p,n,sizeof(tp)))
+#define mi_heap_recalloc_tp(hp,p,tp,n)  ((tp*)mi_heap_recalloc(hp,p,n,sizeof(tp)))
+
+
+// ------------------------------------------------------
+// Options
+// ------------------------------------------------------
+
+typedef enum mi_option_e {
+  // stable options
+  mi_option_show_errors,
+  mi_option_show_stats,
+  mi_option_verbose,
+  // some of the following options are experimental
+  // (deprecated options are kept for binary backward compatibility with v1.x versions)
+  mi_option_eager_commit,
+  mi_option_deprecated_eager_region_commit,
+  mi_option_deprecated_reset_decommits,
+  mi_option_large_os_pages,           // use large (2MiB) OS pages, implies eager commit
+  mi_option_reserve_huge_os_pages,    // reserve N huge OS pages (1GiB) at startup
+  mi_option_reserve_huge_os_pages_at, // reserve huge OS pages at a specific NUMA node
+  mi_option_reserve_os_memory,        // reserve specified amount of OS memory at startup
+  mi_option_deprecated_segment_cache,
+  mi_option_page_reset,
+  mi_option_abandoned_page_decommit,
+  mi_option_deprecated_segment_reset,
+  mi_option_eager_commit_delay,
+  mi_option_decommit_delay,
+  mi_option_use_numa_nodes,           // 0 = use available numa nodes, otherwise use at most N nodes.
+  mi_option_limit_os_alloc,           // 1 = do not use OS memory for allocation (but only reserved arenas)
+  mi_option_os_tag,
+  mi_option_max_errors,
+  mi_option_max_warnings,
+  mi_option_max_segment_reclaim,
+  mi_option_allow_decommit,
+  mi_option_segment_decommit_delay,
+  mi_option_decommit_extend_delay,
+  mi_option_destroy_on_exit,
+  _mi_option_last
+} mi_option_t;
+
+
+mi_decl_nodiscard mi_decl_export bool mi_option_is_enabled(mi_option_t option);
+mi_decl_export void mi_option_enable(mi_option_t option);
+mi_decl_export void mi_option_disable(mi_option_t option);
+mi_decl_export void mi_option_set_enabled(mi_option_t option, bool enable);
+mi_decl_export void mi_option_set_enabled_default(mi_option_t option, bool enable);
+
+mi_decl_nodiscard mi_decl_export long mi_option_get(mi_option_t option);
+mi_decl_nodiscard mi_decl_export long mi_option_get_clamp(mi_option_t option, long min, long max);
+mi_decl_export void mi_option_set(mi_option_t option, long value);
+mi_decl_export void mi_option_set_default(mi_option_t option, long value);
+
+
+// -------------------------------------------------------------------------------------------------------
+// "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
+// (This can be convenient when providing overrides of these functions as done in `mimalloc-override.h`.)
+// note: we use `mi_cfree` as "checked free" and it checks if the pointer is in our heap before free-ing.
+// -------------------------------------------------------------------------------------------------------
+
+mi_decl_export void  mi_cfree(void* p) mi_attr_noexcept;
+mi_decl_export void* mi__expand(void* p, size_t newsize) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_size(const void* p)        mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_good_size(size_t size)     mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept;
+
+mi_decl_export int mi_posix_memalign(void** p, size_t alignment, size_t size)   mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_memalign(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_valloc(size_t size)  mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_pvalloc(size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_aligned_alloc(size_t alignment, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2) mi_attr_alloc_align(1);
+
+mi_decl_nodiscard mi_decl_export void* mi_reallocarray(void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(2,3);
+mi_decl_nodiscard mi_decl_export int   mi_reallocarr(void* p, size_t count, size_t size) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_recalloc(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_nodiscard mi_decl_export void* mi_aligned_offset_recalloc(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept;
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned short* mi_wcsdup(const unsigned short* s) mi_attr_noexcept mi_attr_malloc;
+mi_decl_nodiscard mi_decl_export mi_decl_restrict unsigned char*  mi_mbsdup(const unsigned char* s)  mi_attr_noexcept mi_attr_malloc;
+mi_decl_export int mi_dupenv_s(char** buf, size_t* size, const char* name)                      mi_attr_noexcept;
+mi_decl_export int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name) mi_attr_noexcept;
+
+mi_decl_export void mi_free_size(void* p, size_t size)                           mi_attr_noexcept;
+mi_decl_export void mi_free_size_aligned(void* p, size_t size, size_t alignment) mi_attr_noexcept;
+mi_decl_export void mi_free_aligned(void* p, size_t alignment)                   mi_attr_noexcept;
+
+// The `mi_new` wrappers implement C++ semantics on out-of-memory instead of directly returning `NULL`.
+// (and call `std::get_new_handler` and potentially raise a `std::bad_alloc` exception).
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new(size_t size)                   mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned(size_t size, size_t alignment) mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_nothrow(size_t size)           mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_aligned_nothrow(size_t size, size_t alignment) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(1) mi_attr_alloc_align(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_new_n(size_t count, size_t size)   mi_attr_malloc mi_attr_alloc_size2(1, 2);
+mi_decl_nodiscard mi_decl_export void* mi_new_realloc(void* p, size_t newsize)                mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export void* mi_new_reallocn(void* p, size_t newcount, size_t size) mi_attr_alloc_size2(2, 3);
+
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size_t size)                mi_attr_malloc mi_attr_alloc_size(2);
+mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, size_t count, size_t size) mi_attr_malloc mi_attr_alloc_size2(2, 3);
+
+#ifdef __cplusplus
+}
+#endif
+
+// ---------------------------------------------------------------------------------------------
+// Implement the C++ std::allocator interface for use in STL containers.
+// (note: see `mimalloc-new-delete.h` for overriding the new/delete operators globally)
+// ---------------------------------------------------------------------------------------------
+#ifdef __cplusplus
+
+#include <cstddef>     // std::size_t
+#include <cstdint>     // PTRDIFF_MAX
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#include <type_traits> // std::true_type
+#include <utility>     // std::forward
+#endif
+
+template<class T> struct _mi_stl_allocator_common {
+  typedef T                 value_type;
+  typedef std::size_t       size_type;
+  typedef std::ptrdiff_t    difference_type;
+  typedef value_type&       reference;
+  typedef value_type const& const_reference;
+  typedef value_type*       pointer;
+  typedef value_type const* const_pointer;
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap            = std::true_type;
+  template <class U, class ...Args> void construct(U* p, Args&& ...args) { ::new(p) U(std::forward<Args>(args)...); }
+  template <class U> void destroy(U* p) mi_attr_noexcept { p->~U(); }
+  #else
+  void construct(pointer p, value_type const& val) { ::new(p) value_type(val); }
+  void destroy(pointer p) { p->~value_type(); }
+  #endif
+
+  size_type     max_size() const mi_attr_noexcept { return (PTRDIFF_MAX/sizeof(value_type)); }
+  pointer       address(reference x) const        { return &x; }
+  const_pointer address(const_reference x) const  { return &x; }
+};
+
+template<class T> struct mi_stl_allocator : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+  template <class U> struct rebind { typedef mi_stl_allocator<U> other; };
+
+  mi_stl_allocator()                                             mi_attr_noexcept = default;
+  mi_stl_allocator(const mi_stl_allocator&)                      mi_attr_noexcept = default;
+  template<class U> mi_stl_allocator(const mi_stl_allocator<U>&) mi_attr_noexcept { }
+  mi_stl_allocator  select_on_container_copy_construction() const { return *this; }
+  void              deallocate(T* p, size_type) { mi_free(p); }
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_new_n(count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_new_n(count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::true_type;
+  #endif
+};
+
+template<class T1,class T2> bool operator==(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return true; }
+template<class T1,class T2> bool operator!=(const mi_stl_allocator<T1>& , const mi_stl_allocator<T2>& ) mi_attr_noexcept { return false; }
+
+
+#if (__cplusplus >= 201103L) || (_MSC_VER > 1900)  // C++11
+#include <memory>      // std::shared_ptr
+
+// Common base class for STL allocators in a specific heap
+template<class T, bool destroy> struct _mi_heap_stl_allocator_common : public _mi_stl_allocator_common<T> {
+  using typename _mi_stl_allocator_common<T>::size_type;
+  using typename _mi_stl_allocator_common<T>::value_type;
+  using typename _mi_stl_allocator_common<T>::pointer;
+
+  _mi_heap_stl_allocator_common(mi_heap_t* hp) : heap(hp) { }    /* will not delete nor destroy the passed in heap */
+
+  #if (__cplusplus >= 201703L)  // C++17
+  mi_decl_nodiscard T* allocate(size_type count) { return static_cast<T*>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(T))); }
+  mi_decl_nodiscard T* allocate(size_type count, const void*) { return allocate(count); }
+  #else
+  mi_decl_nodiscard pointer allocate(size_type count, const void* = 0) { return static_cast<pointer>(mi_heap_alloc_new_n(this->heap.get(), count, sizeof(value_type))); }
+  #endif
+
+  #if ((__cplusplus >= 201103L) || (_MSC_VER > 1900))  // C++11
+  using is_always_equal = std::false_type;
+  #endif
+
+  void collect(bool force) { mi_heap_collect(this->heap.get(), force); }
+  template<class U> bool is_equal(const _mi_heap_stl_allocator_common<U, destroy>& x) const { return (this->heap == x.heap); }
+
+protected:
+  std::shared_ptr<mi_heap_t> heap;
+  template<class U, bool D> friend struct _mi_heap_stl_allocator_common;
+
+  _mi_heap_stl_allocator_common() {
+    mi_heap_t* hp = mi_heap_new();
+    this->heap.reset(hp, (destroy ? &heap_destroy : &heap_delete));  /* calls heap_delete/destroy when the refcount drops to zero */
+  }
+  _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common& x) mi_attr_noexcept : heap(x.heap) { }
+  template<class U> _mi_heap_stl_allocator_common(const _mi_heap_stl_allocator_common<U, destroy>& x) mi_attr_noexcept : heap(x.heap) { }
+
+private:
+  static void heap_delete(mi_heap_t* hp)  { if (hp != NULL) { mi_heap_delete(hp); } }
+  static void heap_destroy(mi_heap_t* hp) { if (hp != NULL) { mi_heap_destroy(hp); } }
+};
+
+// STL allocator allocation in a specific heap
+template<class T> struct mi_heap_stl_allocator : public _mi_heap_stl_allocator_common<T, false> {
+  using typename _mi_heap_stl_allocator_common<T, false>::size_type;
+  mi_heap_stl_allocator() : _mi_heap_stl_allocator_common<T, false>() { } // creates fresh heap that is deleted when the destructor is called
+  mi_heap_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, false>(hp) { }  // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_stl_allocator(const mi_heap_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, false>(x) { }
+
+  mi_heap_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T* p, size_type) { mi_free(p); }
+  template<class U> struct rebind { typedef mi_heap_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_stl_allocator<T1>& x, const mi_heap_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+
+// STL allocator allocation in a specific heap, where `free` does nothing and
+// the heap is destroyed in one go on destruction -- use with care!
+template<class T> struct mi_heap_destroy_stl_allocator : public _mi_heap_stl_allocator_common<T, true> {
+  using typename _mi_heap_stl_allocator_common<T, true>::size_type;
+  mi_heap_destroy_stl_allocator() : _mi_heap_stl_allocator_common<T, true>() { } // creates fresh heap that is destroyed when the destructor is called
+  mi_heap_destroy_stl_allocator(mi_heap_t* hp) : _mi_heap_stl_allocator_common<T, true>(hp) { }  // no delete nor destroy on the passed in heap
+  template<class U> mi_heap_destroy_stl_allocator(const mi_heap_destroy_stl_allocator<U>& x) mi_attr_noexcept : _mi_heap_stl_allocator_common<T, true>(x) { }
+
+  mi_heap_destroy_stl_allocator select_on_container_copy_construction() const { return *this; }
+  void deallocate(T*, size_type) { /* do nothing as we destroy the heap on destruct. */ }
+  template<class U> struct rebind { typedef mi_heap_destroy_stl_allocator<U> other; };
+};
+
+template<class T1, class T2> bool operator==(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (x.is_equal(y)); }
+template<class T1, class T2> bool operator!=(const mi_heap_destroy_stl_allocator<T1>& x, const mi_heap_destroy_stl_allocator<T2>& y) mi_attr_noexcept { return (!x.is_equal(y)); }
+
+#endif // C++11
+
+#endif // __cplusplus
+
+#endif
diff --git a/compat/mimalloc/options.c b/compat/mimalloc/options.c
new file mode 100644
index 0000000000..ebb227da14
--- /dev/null
+++ b/compat/mimalloc/options.c
@@ -0,0 +1,642 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>
+#include <stdlib.h> // strtol
+#include <string.h> // strncpy, strncat, strlen, strstr
+#include <ctype.h>  // toupper
+#include <stdarg.h>
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996)   // strncpy, strncat
+#endif
+
+
+static long mi_max_error_count   = 16; // stop outputting errors after this (use < 0 for no limit)
+static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit)
+
+static void mi_add_stderr_output(void);
+
+int mi_version(void) mi_attr_noexcept {
+  return MI_MALLOC_VERSION;
+}
+
+#ifdef _WIN32
+#include <conio.h>
+#endif
+
+// --------------------------------------------------------
+// Options
+// These can be accessed by multiple threads and may be
+// concurrently initialized, but an initializing data race
+// is ok since they resolve to the same value.
+// --------------------------------------------------------
+typedef enum mi_init_e {
+  UNINIT,       // not yet initialized
+  DEFAULTED,    // not found in the environment, use default value
+  INITIALIZED   // found in environment or set explicitly
+} mi_init_t;
+
+typedef struct mi_option_desc_s {
+  long        value;  // the value
+  mi_init_t   init;   // is it initialized yet? (from the environment)
+  mi_option_t option; // for debugging: the option index should match the option
+  const char* name;   // option name without `mimalloc_` prefix
+  const char* legacy_name; // potential legacy v1.x option name
+} mi_option_desc_t;
+
+#define MI_OPTION(opt)                  mi_option_##opt, #opt, NULL
+#define MI_OPTION_LEGACY(opt,legacy)    mi_option_##opt, #opt, #legacy
+
+static mi_option_desc_t options[_mi_option_last] =
+{
+  // stable options
+  #if MI_DEBUG || defined(MI_SHOW_ERRORS)
+  { 1, UNINIT, MI_OPTION(show_errors) },
+  #else
+  { 0, UNINIT, MI_OPTION(show_errors) },
+  #endif
+  { 0, UNINIT, MI_OPTION(show_stats) },
+  { 0, UNINIT, MI_OPTION(verbose) },
+
+  // Some of the following options are experimental and not all combinations are valid. Use with care.
+  { 1, UNINIT, MI_OPTION(eager_commit) },        // commit per segment directly (8MiB)  (but see also `eager_commit_delay`)
+  { 0, UNINIT, MI_OPTION(deprecated_eager_region_commit) },
+  { 0, UNINIT, MI_OPTION(deprecated_reset_decommits) },
+  { 0, UNINIT, MI_OPTION(large_os_pages) },      // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
+  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },  // per 1GiB huge pages
+  { -1, UNINIT, MI_OPTION(reserve_huge_os_pages_at) }, // reserve huge pages at node N
+  { 0, UNINIT, MI_OPTION(reserve_os_memory)     },
+  { 0, UNINIT, MI_OPTION(deprecated_segment_cache) },  // cache N segments per thread
+  { 0, UNINIT, MI_OPTION(page_reset) },          // reset page memory on free
+  { 0, UNINIT, MI_OPTION_LEGACY(abandoned_page_decommit, abandoned_page_reset) },// decommit free page memory when a thread terminates
+  { 0, UNINIT, MI_OPTION(deprecated_segment_reset) },
+  #if defined(__NetBSD__)
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+  #elif defined(_WIN32)
+  { 4, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  #else
+  { 1, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
+  #endif
+  { 25,   UNINIT, MI_OPTION_LEGACY(decommit_delay, reset_delay) }, // page decommit delay in milli-seconds
+  { 0,    UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes.
+  { 0,    UNINIT, MI_OPTION(limit_os_alloc) },    // 1 = do not use OS memory for allocation (but only reserved arenas)
+  { 100,  UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
+  { 16,   UNINIT, MI_OPTION(max_errors) },        // maximum errors that are output
+  { 16,   UNINIT, MI_OPTION(max_warnings) },      // maximum warnings that are output
+  { 8,    UNINIT, MI_OPTION(max_segment_reclaim)},// max. number of segment reclaims from the abandoned segments per try.
+  { 1,    UNINIT, MI_OPTION(allow_decommit) },    // decommit slices when no longer used (after decommit_delay milli-seconds)
+  { 500,  UNINIT, MI_OPTION(segment_decommit_delay) }, // decommit delay in milli-seconds for freed segments
+  { 1,    UNINIT, MI_OPTION(decommit_extend_delay) },
+  { 0,    UNINIT, MI_OPTION(destroy_on_exit)}     // release all OS memory on process exit; careful with dangling pointer or after-exit frees!
+};
+
+static void mi_option_init(mi_option_desc_t* desc);
+
+void _mi_options_init(void) {
+  // called on process load; should not be called before the CRT is initialized!
+  // (e.g. do not call this from process_init as that may run before CRT initialization)
+  mi_add_stderr_output(); // now it safe to use stderr for output
+  for(int i = 0; i < _mi_option_last; i++ ) {
+    mi_option_t option = (mi_option_t)i;
+    long l = mi_option_get(option); MI_UNUSED(l); // initialize
+    // if (option != mi_option_verbose)
+    {
+      mi_option_desc_t* desc = &options[option];
+      _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
+    }
+  }
+  mi_max_error_count = mi_option_get(mi_option_max_errors);
+  mi_max_warning_count = mi_option_get(mi_option_max_warnings);
+}
+
+mi_decl_nodiscard long mi_option_get(mi_option_t option) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return 0;
+  mi_option_desc_t* desc = &options[option];
+  mi_assert(desc->option == option);  // index should match the option
+  if mi_unlikely(desc->init == UNINIT) {
+    mi_option_init(desc);
+  }
+  return desc->value;
+}
+
+mi_decl_nodiscard long mi_option_get_clamp(mi_option_t option, long min, long max) {
+  long x = mi_option_get(option);
+  return (x < min ? min : (x > max ? max : x));
+}
+
+void mi_option_set(mi_option_t option, long value) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return;
+  mi_option_desc_t* desc = &options[option];
+  mi_assert(desc->option == option);  // index should match the option
+  desc->value = value;
+  desc->init = INITIALIZED;
+}
+
+void mi_option_set_default(mi_option_t option, long value) {
+  mi_assert(option >= 0 && option < _mi_option_last);
+  if (option < 0 || option >= _mi_option_last) return;
+  mi_option_desc_t* desc = &options[option];
+  if (desc->init != INITIALIZED) {
+    desc->value = value;
+  }
+}
+
+mi_decl_nodiscard bool mi_option_is_enabled(mi_option_t option) {
+  return (mi_option_get(option) != 0);
+}
+
+void mi_option_set_enabled(mi_option_t option, bool enable) {
+  mi_option_set(option, (enable ? 1 : 0));
+}
+
+void mi_option_set_enabled_default(mi_option_t option, bool enable) {
+  mi_option_set_default(option, (enable ? 1 : 0));
+}
+
+void mi_option_enable(mi_option_t option) {
+  mi_option_set_enabled(option,true);
+}
+
+void mi_option_disable(mi_option_t option) {
+  mi_option_set_enabled(option,false);
+}
+
+
+static void mi_cdecl mi_out_stderr(const char* msg, void* arg) {
+  MI_UNUSED(arg);
+  if (msg == NULL) return;
+  #ifdef _WIN32
+  // on windows with redirection, the C runtime cannot handle locale dependent output
+  // after the main thread closes so we use direct console output.
+  if (!_mi_preloading()) {
+    // _cputs(msg);  // _cputs cannot be used at is aborts if it fails to lock the console
+    static HANDLE hcon = INVALID_HANDLE_VALUE;
+    static bool hconIsConsole;
+    if (hcon == INVALID_HANDLE_VALUE) {
+      CONSOLE_SCREEN_BUFFER_INFO sbi;
+      hcon = GetStdHandle(STD_ERROR_HANDLE);
+      hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi));
+    }
+    const size_t len = strlen(msg);
+    if (len > 0 && len < UINT32_MAX) {
+      DWORD written = 0;
+      if (hconIsConsole) {
+	WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else if (hcon != INVALID_HANDLE_VALUE) {
+	// use direct write if stderr was redirected
+	WriteFile(hcon, msg, (DWORD)len, &written, NULL);
+      }
+      else {
+	// finally fall back to fputs after all
+	fputs(msg, stderr);
+      }
+    }
+  }
+  #else
+  fputs(msg, stderr);
+  #endif
+}
+
+// Since an output function can be registered earliest in the `main`
+// function we also buffer output that happens earlier. When
+// an output function is registered it is called immediately with
+// the output up to that point.
+#ifndef MI_MAX_DELAY_OUTPUT
+#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024))
+#endif
+static char out_buf[MI_MAX_DELAY_OUTPUT+1];
+static _Atomic(size_t) out_len;
+
+static void mi_cdecl mi_out_buf(const char* msg, void* arg) {
+  MI_UNUSED(arg);
+  if (msg==NULL) return;
+  if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
+  size_t n = strlen(msg);
+  if (n==0) return;
+  // claim space
+  size_t start = mi_atomic_add_acq_rel(&out_len, n);
+  if (start >= MI_MAX_DELAY_OUTPUT) return;
+  // check bound
+  if (start+n >= MI_MAX_DELAY_OUTPUT) {
+    n = MI_MAX_DELAY_OUTPUT-start-1;
+  }
+  _mi_memcpy(&out_buf[start], msg, n);
+}
+
+static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
+  if (out==NULL) return;
+  // claim (if `no_more_buf == true`, no more output will be added after this point)
+  size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
+  // and output the current contents
+  if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
+  out_buf[count] = 0;
+  out(out_buf,arg);
+  if (!no_more_buf) {
+    out_buf[count] = '\n'; // if continue with the buffer, insert a newline
+  }
+}
+
+
+// Once this module is loaded, switch to this routine
+// which outputs to stderr and the delayed output buffer.
+static void mi_cdecl mi_out_buf_stderr(const char* msg, void* arg) {
+  mi_out_stderr(msg,arg);
+  mi_out_buf(msg,arg);
+}
+
+
+
+// --------------------------------------------------------
+// Default output handler
+// --------------------------------------------------------
+
+// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t.
+// For now, don't register output from multiple threads.
+static mi_output_fun* volatile mi_out_default; // = NULL
+static _Atomic(void*) mi_out_arg; // = NULL
+
+static mi_output_fun* mi_out_get_default(void** parg) {
+  if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); }
+  mi_output_fun* out = mi_out_default;
+  return (out == NULL ? &mi_out_buf : out);
+}
+
+void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
+  mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
+  if (out!=NULL) mi_out_buf_flush(out,true,arg);         // output all the delayed output now
+}
+
+// add stderr to the delayed output after the module is loaded
+static void mi_add_stderr_output() {
+  mi_assert_internal(mi_out_default == NULL);
+  mi_out_buf_flush(&mi_out_stderr, false, NULL); // flush current contents to stderr
+  mi_out_default = &mi_out_buf_stderr;           // and add stderr to the delayed output
+}
+
+// --------------------------------------------------------
+// Messages, all end up calling `_mi_fputs`.
+// --------------------------------------------------------
+static _Atomic(size_t) error_count;   // = 0;  // when >= max_error_count stop emitting errors
+static _Atomic(size_t) warning_count; // = 0;  // when >= max_warning_count stop emitting warnings
+
+// When overriding malloc, we may recurse into mi_vfprintf if an allocation
+// inside the C runtime causes another message.
+// In some cases (like on macOS) the loader already allocates which
+// calls into mimalloc; if we then access thread locals (like `recurse`)
+// this may crash as the access may call _tlv_bootstrap that tries to
+// (recursively) invoke malloc again to allocate space for the thread local
+// variables on demand. This is why we use a _mi_preloading test on such
+// platforms. However, C code generator may move the initial thread local address
+// load before the `if` and we therefore split it out in a separate funcion.
+static mi_decl_thread bool recurse = false;
+
+static mi_decl_noinline bool mi_recurse_enter_prim(void) {
+  if (recurse) return false;
+  recurse = true;
+  return true;
+}
+
+static mi_decl_noinline void mi_recurse_exit_prim(void) {
+  recurse = false;
+}
+
+static bool mi_recurse_enter(void) {
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  if (_mi_preloading()) return true;
+  #endif
+  return mi_recurse_enter_prim();
+}
+
+static void mi_recurse_exit(void) {
+  #if defined(__APPLE__) || defined(MI_TLS_RECURSE_GUARD)
+  if (_mi_preloading()) return;
+  #endif
+  mi_recurse_exit_prim();
+}
+
+void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
+  if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
+    if (!mi_recurse_enter()) return;
+    out = mi_out_get_default(&arg);
+    if (prefix != NULL) out(prefix, arg);
+    out(message, arg);
+    mi_recurse_exit();
+  }
+  else {
+    if (prefix != NULL) out(prefix, arg);
+    out(message, arg);
+  }
+}
+
+// Define our own limited `fprintf` that avoids memory allocation.
+// We do this using `snprintf` with a limited buffer.
+static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
+  char buf[512];
+  if (fmt==NULL) return;
+  if (!mi_recurse_enter()) return;
+  vsnprintf(buf,sizeof(buf)-1,fmt,args);
+  mi_recurse_exit();
+  _mi_fputs(out,arg,prefix,buf);
+}
+
+void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf(out,arg,NULL,fmt,args);
+  va_end(args);
+}
+
+static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) {
+  if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) {
+    char tprefix[64];
+    snprintf(tprefix, sizeof(tprefix), "%sthread 0x%llx: ", prefix, (unsigned long long)_mi_thread_id());
+    mi_vfprintf(out, arg, tprefix, fmt, args);
+  }
+  else {
+    mi_vfprintf(out, arg, prefix, fmt, args);
+  }
+}
+
+void _mi_trace_message(const char* fmt, ...) {
+  if (mi_option_get(mi_option_verbose) <= 1) return;  // only with verbose level 2 or higher
+  va_list args;
+  va_start(args, fmt);
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: ", fmt, args);
+  va_end(args);
+}
+
+void _mi_verbose_message(const char* fmt, ...) {
+  if (!mi_option_is_enabled(mi_option_verbose)) return;
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf(NULL, NULL, "mimalloc: ", fmt, args);
+  va_end(args);
+}
+
+static void mi_show_error_message(const char* fmt, va_list args) {
+  if (!mi_option_is_enabled(mi_option_verbose)) {
+    if (!mi_option_is_enabled(mi_option_show_errors)) return;
+    if (mi_max_error_count >= 0 && (long)mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return;
+  }
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: error: ", fmt, args);
+}
+
+void _mi_warning_message(const char* fmt, ...) {
+  if (!mi_option_is_enabled(mi_option_verbose)) {
+    if (!mi_option_is_enabled(mi_option_show_errors)) return;
+    if (mi_max_warning_count >= 0 && (long)mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return;
+  }
+  va_list args;
+  va_start(args,fmt);
+  mi_vfprintf_thread(NULL, NULL, "mimalloc: warning: ", fmt, args);
+  va_end(args);
+}
+
+
+#if MI_DEBUG
+void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, const char* func ) {
+  _mi_fprintf(NULL, NULL, "mimalloc: assertion failed: at \"%s\":%u, %s\n  assertion: \"%s\"\n", fname, line, (func==NULL?"":func), assertion);
+  abort();
+}
+#endif
+
+// --------------------------------------------------------
+// Errors
+// --------------------------------------------------------
+
+static mi_error_fun* volatile  mi_error_handler; // = NULL
+static _Atomic(void*) mi_error_arg;     // = NULL
+
+static void mi_error_default(int err) {
+  MI_UNUSED(err);
+#if (MI_DEBUG>0)
+  if (err==EFAULT) {
+    #ifdef _MSC_VER
+    __debugbreak();
+    #endif
+    abort();
+  }
+#endif
+#if (MI_SECURE>0)
+  if (err==EFAULT) {  // abort on serious errors in secure mode (corrupted meta-data)
+    abort();
+  }
+#endif
+#if defined(MI_XMALLOC)
+  if (err==ENOMEM || err==EOVERFLOW) { // abort on memory allocation fails in xmalloc mode
+    abort();
+  }
+#endif
+}
+
+void mi_register_error(mi_error_fun* fun, void* arg) {
+  mi_error_handler = fun;  // can be NULL
+  mi_atomic_store_ptr_release(void,&mi_error_arg, arg);
+}
+
+void _mi_error_message(int err, const char* fmt, ...) {
+  // show detailed error message
+  va_list args;
+  va_start(args, fmt);
+  mi_show_error_message(fmt, args);
+  va_end(args);
+  // and call the error handler which may abort (or return normally)
+  if (mi_error_handler != NULL) {
+    mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg));
+  }
+  else {
+    mi_error_default(err);
+  }
+}
+
+// --------------------------------------------------------
+// Initialize options by checking the environment
+// --------------------------------------------------------
+
+static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
+  if (dest==NULL || src==NULL || dest_size == 0) return;
+  // copy until end of src, or when dest is (almost) full
+  while (*src != 0 && dest_size > 1) {
+    *dest++ = *src++;
+    dest_size--;
+  }
+  // always zero terminate
+  *dest = 0;
+}
+
+static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
+  if (dest==NULL || src==NULL || dest_size == 0) return;
+  // find end of string in the dest buffer
+  while (*dest != 0 && dest_size > 1) {
+    dest++;
+    dest_size--;
+  }
+  // and catenate
+  mi_strlcpy(dest, src, dest_size);
+}
+
+#ifdef MI_NO_GETENV
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  MI_UNUSED(name);
+  MI_UNUSED(result);
+  MI_UNUSED(result_size);
+  return false;
+}
+#else
+#if defined _WIN32
+// On Windows use GetEnvironmentVariable instead of getenv to work
+// reliably even when this is invoked before the C runtime is initialized.
+// i.e. when `_mi_preloading() == true`.
+// Note: on windows, environment names are not case sensitive.
+#include <windows.h>
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  result[0] = 0;
+  size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
+  return (len > 0 && len < result_size);
+}
+#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
+// On Posix systemsr use `environ` to acces environment variables
+// even before the C runtime is initialized.
+#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
+#include <crt_externs.h>
+static char** mi_get_environ(void) {
+  return (*_NSGetEnviron());
+}
+#else
+extern char** environ;
+static char** mi_get_environ(void) {
+  return environ;
+}
+#endif
+static int mi_strnicmp(const char* s, const char* t, size_t n) {
+  if (n == 0) return 0;
+  for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
+    if (toupper(*s) != toupper(*t)) break;
+  }
+  return (n == 0 ? 0 : *s - *t);
+}
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  if (name==NULL) return false;
+  const size_t len = strlen(name);
+  if (len == 0) return false;
+  char** env = mi_get_environ();
+  if (env == NULL) return false;
+  // compare up to 256 entries
+  for (int i = 0; i < 256 && env[i] != NULL; i++) {
+    const char* s = env[i];
+    if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
+      // found it
+      mi_strlcpy(result, s + len + 1, result_size);
+      return true;
+    }
+  }
+  return false;
+}
+#else
+// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
+static bool mi_getenv(const char* name, char* result, size_t result_size) {
+  // cannot call getenv() when still initializing the C runtime.
+  if (_mi_preloading()) return false;
+  const char* s = getenv(name);
+  if (s == NULL) {
+    // we check the upper case name too.
+    char buf[64+1];
+    size_t len = strlen(name);
+    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = toupper(name[i]);
+    }
+    buf[len] = 0;
+    s = getenv(buf);
+  }
+  if (s != NULL && strlen(s) < result_size) {
+    mi_strlcpy(result, s, result_size);
+    return true;
+  }
+  else {
+    return false;
+  }
+}
+#endif  // !MI_USE_ENVIRON
+#endif  // !MI_NO_GETENV
+
+static void mi_option_init(mi_option_desc_t* desc) {
+  // Read option value from the environment
+  char s[64+1];
+  char buf[64+1];
+  mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+  mi_strlcat(buf, desc->name, sizeof(buf));
+  bool found = mi_getenv(buf,s,sizeof(s));
+  if (!found && desc->legacy_name != NULL) {
+    mi_strlcpy(buf, "mimalloc_", sizeof(buf));
+    mi_strlcat(buf, desc->legacy_name, sizeof(buf));
+    found = mi_getenv(buf,s,sizeof(s));
+    if (found) {
+      _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name );
+    }
+  }
+
+  if (found) {
+    size_t len = strlen(s);
+    if (len >= sizeof(buf)) len = sizeof(buf) - 1;
+    for (size_t i = 0; i < len; i++) {
+      buf[i] = (char)toupper(s[i]);
+    }
+    buf[len] = 0;
+    if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) {
+      desc->value = 1;
+      desc->init = INITIALIZED;
+    }
+    else if (strstr("0;FALSE;NO;OFF", buf) != NULL) {
+      desc->value = 0;
+      desc->init = INITIALIZED;
+    }
+    else {
+      char* end = buf;
+      long value = strtol(buf, &end, 10);
+      if (desc->option == mi_option_reserve_os_memory) {
+	// this option is interpreted in KiB to prevent overflow of `long`
+	if (*end == 'K') { end++; }
+	else if (*end == 'M') { value *= MI_KiB; end++; }
+	else if (*end == 'G') { value *= MI_MiB; end++; }
+	else { value = (value + MI_KiB - 1) / MI_KiB; }
+	if (end[0] == 'I' && end[1] == 'B') { end += 2; }
+	else if (*end == 'B') { end++; }
+      }
+      if (*end == 0) {
+	desc->value = value;
+	desc->init = INITIALIZED;
+      }
+      else {
+	// set `init` first to avoid recursion through _mi_warning_message on mimalloc_verbose.
+	desc->init = DEFAULTED;
+	if (desc->option == mi_option_verbose && desc->value == 0) {
+	  // if the 'mimalloc_verbose' env var has a bogus value we'd never know
+	  // (since the value defaults to 'off') so in that case briefly enable verbose
+	  desc->value = 1;
+	  _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+	  desc->value = 0;
+	}
+	else {
+	  _mi_warning_message("environment option mimalloc_%s has an invalid value.\n", desc->name );
+	}
+      }
+    }
+    mi_assert_internal(desc->init != UNINIT);
+  }
+  else if (!_mi_preloading()) {
+    desc->init = DEFAULTED;
+  }
+}
diff --git a/compat/mimalloc/os.c b/compat/mimalloc/os.c
new file mode 100644
index 0000000000..3503f8b28f
--- /dev/null
+++ b/compat/mimalloc/os.c
@@ -0,0 +1,1479 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // ensure mmap flags are defined
+#endif
+
+#if defined(__sun)
+// illumos provides new mman.h api when any of these are defined
+// otherwise the old api based on caddr_t which predates the void pointers one.
+// stock solaris provides only the former, chose to atomically to discard those
+// flags only here rather than project wide tough.
+#undef _XOPEN_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // strerror
+
+#ifdef _MSC_VER
+#pragma warning(disable:4996)  // strerror
+#endif
+
+#if defined(__wasi__)
+#define MI_USE_SBRK
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__wasi__)
+#include <unistd.h>    // sbrk
+#else
+#include <sys/mman.h>  // mmap
+#include <unistd.h>    // sysconf
+#if defined(__linux__)
+#include <features.h>
+#include <fcntl.h>
+#if defined(__GLIBC__)
+#include <linux/mman.h> // linux mmap flags
+#else
+#include <sys/mman.h>
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
+#include <mach/vm_statistics.h>
+#endif
+#endif
+#if defined(__FreeBSD__) || defined(__DragonFly__)
+#include <sys/param.h>
+#if __FreeBSD_version >= 1200000
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+#endif
+
+/* -----------------------------------------------------------
+  Initialization.
+  On windows initializes support for aligned allocation and
+  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
+----------------------------------------------------------- */
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats);
+
+static void* mi_align_up_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_up((uintptr_t)p, alignment);
+}
+
+static void* mi_align_down_ptr(void* p, size_t alignment) {
+  return (void*)_mi_align_down((uintptr_t)p, alignment);
+}
+
+
+// page size (initialized properly in `os_init`)
+static size_t os_page_size = 4096;
+
+// minimal allocation granularity
+static size_t os_alloc_granularity = 4096;
+
+// if non-zero, use large page allocation
+static size_t large_os_page_size = 0;
+
+// is memory overcommit allowed?
+// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE)
+static bool os_overcommit = true;
+
+bool _mi_os_has_overcommit(void) {
+  return os_overcommit;
+}
+
+// OS (small) page size
+size_t _mi_os_page_size(void) {
+  return os_page_size;
+}
+
+// if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB)
+size_t _mi_os_large_page_size(void) {
+  return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size());
+}
+
+#if !defined(MI_USE_SBRK) && !defined(__wasi__)
+static bool use_large_os_page(size_t size, size_t alignment) {
+  // if we have access, check the size and alignment requirements
+  if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false;
+  return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0);
+}
+#endif
+
+// round to a good OS allocation size (bounded by max 12.5% waste)
+size_t _mi_os_good_alloc_size(size_t size) {
+  size_t align_size;
+  if (size < 512*MI_KiB) align_size = _mi_os_page_size();
+  else if (size < 2*MI_MiB) align_size = 64*MI_KiB;
+  else if (size < 8*MI_MiB) align_size = 256*MI_KiB;
+  else if (size < 32*MI_MiB) align_size = 1*MI_MiB;
+  else align_size = 4*MI_MiB;
+  if mi_unlikely(size >= (SIZE_MAX - align_size)) return size; // possible overflow?
+  return _mi_align_up(size, align_size);
+}
+
+#if defined(_WIN32)
+// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
+// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
+// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB)
+// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's.
+typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E {
+  MiMemExtendedParameterInvalidType = 0,
+  MiMemExtendedParameterAddressRequirements,
+  MiMemExtendedParameterNumaNode,
+  MiMemExtendedParameterPartitionHandle,
+  MiMemExtendedParameterUserPhysicalHandle,
+  MiMemExtendedParameterAttributeFlags,
+  MiMemExtendedParameterMax
+} MI_MEM_EXTENDED_PARAMETER_TYPE;
+
+typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S {
+  struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type;
+  union  { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg;
+} MI_MEM_EXTENDED_PARAMETER;
+
+typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S {
+  PVOID  LowestStartingAddress;
+  PVOID  HighestEndingAddress;
+  SIZE_T Alignment;
+} MI_MEM_ADDRESS_REQUIREMENTS;
+
+#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE   0x00000010
+
+#include <winternl.h>
+typedef PVOID    (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG);
+static PVirtualAlloc2 pVirtualAlloc2 = NULL;
+static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
+
+// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7
+typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER;
+
+typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber);
+typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber);
+typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
+typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber);
+static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
+static PGetNumaProcessorNodeEx      pGetNumaProcessorNodeEx = NULL;
+static PGetNumaNodeProcessorMaskEx  pGetNumaNodeProcessorMaskEx = NULL;
+static PGetNumaProcessorNode        pGetNumaProcessorNode = NULL;
+
+static bool mi_win_enable_large_os_pages(void)
+{
+  if (large_os_page_size > 0) return true;
+
+  // Try to see if large OS pages are supported
+  // To use large pages on Windows, we first need access permission
+  // Set "Lock pages in memory" permission in the group policy editor
+  // <https://devblogs.microsoft.com/oldnewthing/20110128-00/?p=11643>
+  unsigned long err = 0;
+  HANDLE token = NULL;
+  BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token);
+  if (ok) {
+    TOKEN_PRIVILEGES tp;
+    ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid);
+    if (ok) {
+      tp.PrivilegeCount = 1;
+      tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
+      ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);
+      if (ok) {
+	err = GetLastError();
+	ok = (err == ERROR_SUCCESS);
+	if (ok) {
+	  large_os_page_size = GetLargePageMinimum();
+	}
+      }
+    }
+    CloseHandle(token);
+  }
+  if (!ok) {
+    if (err == 0) err = GetLastError();
+    _mi_warning_message("cannot enable large OS page support, error %lu\n", err);
+  }
+  return (ok!=0);
+}
+
+void _mi_os_init(void)
+{
+  os_overcommit = false;
+  // get the page size
+  SYSTEM_INFO si;
+  GetSystemInfo(&si);
+  if (si.dwPageSize > 0) os_page_size = si.dwPageSize;
+  if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity;
+  // get the VirtualAlloc2 function
+  HINSTANCE  hDll;
+  hDll = LoadLibrary(TEXT("kernelbase.dll"));
+  if (hDll != NULL) {
+    // use VirtualAlloc2FromApp if possible as it is available to Windows store apps
+    pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp");
+    if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
+    FreeLibrary(hDll);
+  }
+  // NtAllocateVirtualMemoryEx is used for huge page allocation
+  hDll = LoadLibrary(TEXT("ntdll.dll"));
+  if (hDll != NULL) {
+    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
+    FreeLibrary(hDll);
+  }
+  // Try to use Win7+ numa API
+  hDll = LoadLibrary(TEXT("kernel32.dll"));
+  if (hDll != NULL) {
+    pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
+    pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
+    pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
+    pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode");
+    FreeLibrary(hDll);
+  }
+  if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    mi_win_enable_large_os_pages();
+  }
+}
+#elif defined(__wasi__)
+void _mi_os_init(void) {
+  os_overcommit = false;
+  os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB
+  os_alloc_granularity = 16;
+}
+
+#else  // generic unix
+
+static void os_detect_overcommit(void) {
+#if defined(__linux__)
+  int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
+	if (fd < 0) return;
+  char buf[32];
+  ssize_t nread = read(fd, &buf, sizeof(buf));
+	close(fd);
+  // <https://www.kernel.org/doc/Documentation/vm/overcommit-accounting>
+  // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE)
+  if (nread >= 1) {
+    os_overcommit = (buf[0] == '0' || buf[0] == '1');
+  }
+#elif defined(__FreeBSD__)
+  int val = 0;
+  size_t olen = sizeof(val);
+  if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) {
+    os_overcommit = (val != 0);
+  }
+#else
+  // default: overcommit is true
+#endif
+}
+
+void _mi_os_init(void) {
+  // get the page size
+  long result = sysconf(_SC_PAGESIZE);
+  if (result > 0) {
+    os_page_size = (size_t)result;
+    os_alloc_granularity = os_page_size;
+  }
+  large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this?
+  os_detect_overcommit();
+}
+#endif
+
+
+#if defined(MADV_NORMAL)
+static int mi_madvise(void* addr, size_t length, int advice) {
+  #if defined(__sun)
+  return madvise((caddr_t)addr, length, advice);  // Solaris needs cast (issue #520)
+  #else
+  return madvise(addr, length, advice);
+  #endif
+}
+#endif
+
+
+/* -----------------------------------------------------------
+  aligned hinting
+-------------------------------------------------------------- */
+
+// On 64-bit systems, we can do efficient aligned allocation by using
+// the 2TiB to 30TiB area to allocate those.
+#if (MI_INTPTR_SIZE >= 8)
+static mi_decl_cache_align _Atomic(uintptr_t)aligned_base;
+
+// Return a MI_SEGMENT_SIZE aligned address that is probably available.
+// If this returns NULL, the OS will determine the address but on some OS's that may not be
+// properly aligned which can be more costly as it needs to be adjusted afterwards.
+// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization;
+// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses
+//  in the middle of the 2TiB - 6TiB address range (see issue #372))
+
+#define MI_HINT_BASE ((uintptr_t)2 << 40)  // 2TiB start
+#define MI_HINT_AREA ((uintptr_t)4 << 40)  // upto 6TiB   (since before win8 there is "only" 8TiB available to processes)
+#define MI_HINT_MAX  ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages)
+
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size)
+{
+  if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL;
+  size = _mi_align_up(size, MI_SEGMENT_SIZE);
+  if (size > 1*MI_GiB) return NULL;  // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096.
+  #if (MI_SECURE>0)
+  size += MI_SEGMENT_SIZE;        // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas.
+  #endif
+
+  uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
+  if (hint == 0 || hint > MI_HINT_MAX) {   // wrap or initialize
+    uintptr_t init = MI_HINT_BASE;
+    #if (MI_SECURE>0 || MI_DEBUG==0)       // security: randomize start of aligned allocations unless in debug mode
+    uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+    init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA);  // (randomly 20 bits)*4MiB == 0 to 4TiB
+    #endif
+    uintptr_t expected = hint + size;
+    mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
+    hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all
+  }
+  if (hint%try_alignment != 0) return NULL;
+  return (void*)hint;
+}
+#else
+static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
+  MI_UNUSED(try_alignment); MI_UNUSED(size);
+  return NULL;
+}
+#endif
+
+/* -----------------------------------------------------------
+  Free memory
+-------------------------------------------------------------- */
+
+static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
+{
+  if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
+  bool err = false;
+#if defined(_WIN32)
+  DWORD errcode = 0;
+  err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
+  if (err) { errcode = GetLastError(); }
+  if (errcode == ERROR_INVALID_ADDRESS) {
+    // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside
+    // the memory region returned by VirtualAlloc; in that case we need to free using
+    // the start of the region.
+    MEMORY_BASIC_INFORMATION info = { 0 };
+    VirtualQuery(addr, &info, sizeof(info));
+    if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) {
+      errcode = 0;
+      err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0);
+      if (err) { errcode = GetLastError(); }
+    }
+  }
+  if (errcode != 0) {
+    _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size);
+  }
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+  err = false; // sbrk heap cannot be shrunk
+#else
+  err = (munmap(addr, size) == -1);
+  if (err) {
+    _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size);
+  }
+#endif
+  if (was_committed) { _mi_stat_decrease(&stats->committed, size); }
+  _mi_stat_decrease(&stats->reserved, size);
+  return !err;
+}
+
+
+/* -----------------------------------------------------------
+  Raw allocation on Windows (VirtualAlloc)
+-------------------------------------------------------------- */
+
+#ifdef _WIN32
+
+#define MEM_COMMIT_RESERVE  (MEM_COMMIT|MEM_RESERVE)
+
+static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
+#if (MI_INTPTR_SIZE >= 8)
+  // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = mi_os_get_aligned_hint(try_alignment,size);
+    if (hint != NULL) {
+      void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE);
+      if (p != NULL) return p;
+      _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags);
+      // fall through on error
+    }
+  }
+#endif
+  // on modern Windows try use VirtualAlloc2 for aligned allocation
+  if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
+    MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 };
+    reqs.Alignment = try_alignment;
+    MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} };
+    param.Type.Type = MiMemExtendedParameterAddressRequirements;
+    param.Arg.Pointer = &reqs;
+    void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, &param, 1);
+    if (p != NULL) return p;
+    _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags);
+    // fall through on error
+  }
+  // last resort
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
+  mi_assert_internal(!(large_only && !allow_large));
+  static _Atomic(size_t) large_page_try_ok; // = 0;
+  void* p = NULL;
+  // Try to allocate large OS pages (2MiB) if allowed or required.
+  if ((large_only || use_large_os_page(size, try_alignment))
+      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
+      // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      // large OS pages must always reserve and commit.
+      *is_large = true;
+      p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES);
+      if (large_only) return p;
+      // fall back to non-large page allocation on error (`p == NULL`).
+      if (p == NULL) {
+	mi_atomic_store_release(&large_page_try_ok,10UL);  // on error, don't try again for the next N allocations
+      }
+    }
+  }
+  // Fall back to regular page allocation
+  if (p == NULL) {
+    *is_large = ((flags&MEM_LARGE_PAGES) != 0);
+    p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large);
+  }
+  return p;
+}
+
+/* -----------------------------------------------------------
+  Raw allocation using `sbrk` or `wasm_memory_grow`
+-------------------------------------------------------------- */
+
+#elif defined(MI_USE_SBRK) || defined(__wasi__)
+#if defined(MI_USE_SBRK)
+  static void* mi_memory_grow( size_t size ) {
+    void* p = sbrk(size);
+    if (p == (void*)(-1)) return NULL;
+    #if !defined(__wasi__) // on wasi this is always zero initialized already (?)
+    memset(p,0,size);
+    #endif
+    return p;
+  }
+#elif defined(__wasi__)
+  static void* mi_memory_grow( size_t size ) {
+    size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size()))
+			    : __builtin_wasm_memory_size(0));
+    if (base == SIZE_MAX) return NULL;
+    return (void*)(base * _mi_os_page_size());
+  }
+#endif
+
+#if defined(MI_USE_PTHREADS)
+static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER;
+#endif
+
+static void* mi_heap_grow(size_t size, size_t try_alignment) {
+  void* p = NULL;
+  if (try_alignment <= 1) {
+    // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    p = mi_memory_grow(size);
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+  }
+  else {
+    void* base = NULL;
+    size_t alloc_size = 0;
+    // to allocate aligned use a lock to try to avoid thread interaction
+    // between getting the current size and actual allocation
+    // (also, `sbrk` is not thread safe in general)
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_lock(&mi_heap_grow_mutex);
+    #endif
+    {
+      void* current = mi_memory_grow(0);  // get current size
+      if (current != NULL) {
+	void* aligned_current = mi_align_up_ptr(current, try_alignment);  // and align from there to minimize wasted space
+	alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size());
+	base = mi_memory_grow(alloc_size);
+      }
+    }
+    #if defined(MI_USE_PTHREADS)
+    pthread_mutex_unlock(&mi_heap_grow_mutex);
+    #endif
+    if (base != NULL) {
+      p = mi_align_up_ptr(base, try_alignment);
+      if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) {
+	// another thread used wasm_memory_grow/sbrk in-between and we do not have enough
+	// space after alignment. Give up (and waste the space as we cannot shrink :-( )
+	// (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align)
+	p = NULL;
+      }
+    }
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment);
+    errno = ENOMEM;
+    return NULL;
+  }
+  mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 );
+  return p;
+}
+
+/* -----------------------------------------------------------
+  Raw allocation on Unix's (mmap)
+-------------------------------------------------------------- */
+#else
+#define MI_OS_USE_MMAP
+static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) {
+  MI_UNUSED(try_alignment);
+  #if defined(MAP_ALIGNED)  // BSD
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    size_t n = mi_bsr(try_alignment);
+    if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) {  // alignment is a power of 2 and 4096 <= alignment <= 1GiB
+      flags |= MAP_ALIGNED(n);
+      void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap
+    }
+  }
+  #elif defined(MAP_ALIGN)  // Solaris
+  if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
+    void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0);  // addr parameter is the required alignment
+    if (p!=MAP_FAILED) return p;
+    // fall back to regular mmap
+  }
+  #endif
+  #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
+  // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations
+  if (addr == NULL) {
+    void* hint = mi_os_get_aligned_hint(try_alignment, size);
+    if (hint != NULL) {
+      void* p = mmap(hint, size, protect_flags, flags, fd, 0);
+      if (p!=MAP_FAILED) return p;
+      // fall back to regular mmap
+    }
+  }
+  #endif
+  // regular mmap
+  void* p = mmap(addr, size, protect_flags, flags, fd, 0);
+  if (p!=MAP_FAILED) return p;
+  // failed to allocate
+  return NULL;
+}
+
+static int mi_unix_mmap_fd(void) {
+#if defined(VM_MAKE_TAG)
+  // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99)
+  int os_tag = (int)mi_option_get(mi_option_os_tag);
+  if (os_tag < 100 || os_tag > 255) os_tag = 100;
+  return VM_MAKE_TAG(os_tag);
+#else
+  return -1;
+#endif
+}
+
+static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) {
+  void* p = NULL;
+  #if !defined(MAP_ANONYMOUS)
+  #define MAP_ANONYMOUS  MAP_ANON
+  #endif
+  #if !defined(MAP_NORESERVE)
+  #define MAP_NORESERVE  0
+  #endif
+  const int fd = mi_unix_mmap_fd();
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  if (_mi_os_has_overcommit()) {
+    flags |= MAP_NORESERVE;
+  }
+  #if defined(PROT_MAX)
+  protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD
+  #endif
+  // huge page allocation
+  if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
+    static _Atomic(size_t) large_page_try_ok; // = 0;
+    size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
+    if (!large_only && try_ok > 0) {
+      // If the OS is not configured for large OS pages, or the user does not have
+      // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
+      // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
+      // to avoid too many failing calls to mmap.
+      mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
+    }
+    else {
+      int lflags = flags & ~MAP_NORESERVE;  // using NORESERVE on huge pages seems to fail on Linux
+      int lfd = fd;
+      #ifdef MAP_ALIGNED_SUPER
+      lflags |= MAP_ALIGNED_SUPER;
+      #endif
+      #ifdef MAP_HUGETLB
+      lflags |= MAP_HUGETLB;
+      #endif
+      #ifdef MAP_HUGE_1GB
+      static bool mi_huge_pages_available = true;
+      if ((size % MI_GiB) == 0 && mi_huge_pages_available) {
+	lflags |= MAP_HUGE_1GB;
+      }
+      else
+      #endif
+      {
+	#ifdef MAP_HUGE_2MB
+	lflags |= MAP_HUGE_2MB;
+	#endif
+      }
+      #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB
+      lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
+      #endif
+      if (large_only || lflags != flags) {
+	// try large OS page allocation
+	*is_large = true;
+	p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+	#ifdef MAP_HUGE_1GB
+	if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+	  mi_huge_pages_available = false; // don't try huge 1GiB pages again
+	  _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
+	  lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
+	  p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
+	}
+	#endif
+	if (large_only) return p;
+	if (p == NULL) {
+	  mi_atomic_store_release(&large_page_try_ok, (size_t)8);  // on error, don't try again for the next N allocations
+	}
+      }
+    }
+  }
+  // regular allocation
+  if (p == NULL) {
+    *is_large = false;
+    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
+    if (p != NULL) {
+      #if defined(MADV_HUGEPAGE)
+      // Many Linux systems don't allow MAP_HUGETLB but they support instead
+      // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE
+      // though since properly aligned allocations will already use large pages if available
+      // in that case -- in particular for our large regions (in `memory.c`).
+      // However, some systems only allow THP if called with explicit `madvise`, so
+      // when large OS pages are enabled for mimalloc, we call `madvise` anyways.
+      if (allow_large && use_large_os_page(size, try_alignment)) {
+	if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) {
+	  *is_large = true; // possibly
+	};
+      }
+      #elif defined(__sun)
+      if (allow_large && use_large_os_page(size, try_alignment)) {
+	struct memcntl_mha cmd = {0};
+	cmd.mha_pagesize = large_os_page_size;
+	cmd.mha_cmd = MHA_MAPSIZE_VA;
+	if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
+	  *is_large = true;
+	}
+      }
+      #endif
+    }
+  }
+  if (p == NULL) {
+    _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
+  }
+  return p;
+}
+#endif
+
+
+/* -----------------------------------------------------------
+   Primitive allocation from the OS.
+-------------------------------------------------------------- */
+
+// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned.
+static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  if (size == 0) return NULL;
+  if (!commit) allow_large = false;
+  if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning
+
+  void* p = NULL;
+  /*
+  if (commit && allow_large) {
+    p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment);
+    if (p != NULL) {
+      *is_large = true;
+      return p;
+    }
+  }
+  */
+
+  #if defined(_WIN32)
+    int flags = MEM_RESERVE;
+    if (commit) { flags |= MEM_COMMIT; }
+    p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large);
+  #elif defined(MI_USE_SBRK) || defined(__wasi__)
+    MI_UNUSED(allow_large);
+    *is_large = false;
+    p = mi_heap_grow(size, try_alignment);
+  #else
+    int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE);
+    p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large);
+  #endif
+  mi_stat_counter_increase(stats->mmap_calls, 1);
+  if (p != NULL) {
+    _mi_stat_increase(&stats->reserved, size);
+    if (commit) { _mi_stat_increase(&stats->committed, size); }
+  }
+  return p;
+}
+
+
+// Primitive aligned allocation from the OS.
+// This function guarantees the allocated memory is aligned.
+static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, mi_stats_t* stats) {
+  mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0));
+  mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0);
+  mi_assert_internal(is_large != NULL);
+  if (!commit) allow_large = false;
+  if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
+  size = _mi_align_up(size, _mi_os_page_size());
+
+  // try first with a hint (this will be aligned directly on Win 10+ or BSD)
+  void* p = mi_os_mem_alloc(size, alignment, commit, allow_large, is_large, stats);
+  if (p == NULL) return NULL;
+
+  // if not aligned, free it, overallocate, and unmap around it
+  if (((uintptr_t)p % alignment != 0)) {
+    mi_os_mem_free(p, size, commit, stats);
+    _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit);
+    if (size >= (SIZE_MAX - alignment)) return NULL; // overflow
+    const size_t over_size = size + alignment;
+
+#if _WIN32
+    // over-allocate uncommitted (virtual) memory
+    p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats);
+    if (p == NULL) return NULL;
+
+    // set p to the aligned part in the full region
+    // note: this is dangerous on Windows as VirtualFree needs the actual region pointer
+    // but in mi_os_mem_free we handle this (hopefully exceptional) situation.
+    p = mi_align_up_ptr(p, alignment);
+
+    // explicitly commit only the aligned part
+    if (commit) {
+      _mi_os_commit(p, size, NULL, stats);
+    }
+#else
+    // overallocate...
+    p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats);
+    if (p == NULL) return NULL;
+    // and selectively unmap parts around the over-allocated area. (noop on sbrk)
+    void* aligned_p = mi_align_up_ptr(p, alignment);
+    size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p;
+    size_t mid_size = _mi_align_up(size, _mi_os_page_size());
+    size_t post_size = over_size - pre_size - mid_size;
+    mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size);
+    if (pre_size > 0)  mi_os_mem_free(p, pre_size, commit, stats);
+    if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats);
+    // we can return the aligned pointer on `mmap` (and sbrk) systems
+    p = aligned_p;
+#endif
+  }
+
+  mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0));
+  return p;
+}
+
+
+/* -----------------------------------------------------------
+  OS API: alloc, free, alloc_aligned
+----------------------------------------------------------- */
+
+void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  if (size == 0) return NULL;
+  size = _mi_os_good_alloc_size(size);
+  bool is_large = false;
+  return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
+}
+
+void  _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  if (size == 0 || p == NULL) return;
+  size = _mi_os_good_alloc_size(size);
+  mi_os_mem_free(p, size, was_committed, stats);
+}
+
+void  _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
+  _mi_os_free_ex(p, size, true, stats);
+}
+
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
+{
+  MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings
+  MI_UNUSED(tld_stats);
+  if (size == 0) return NULL;
+  size = _mi_os_good_alloc_size(size);
+  alignment = _mi_align_up(alignment, _mi_os_page_size());
+  bool allow_large = false;
+  if (large != NULL) {
+    allow_large = *large;
+    *large = false;
+  }
+  return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ );
+}
+
+/* -----------------------------------------------------------
+  OS aligned allocation with an offset. This is used
+  for large alignments > MI_ALIGNMENT_MAX. We use a large mimalloc
+  page where the object can be aligned at an offset from the start of the segment.
+  As we may need to overallocate, we need to free such pointers using `mi_free_aligned`
+  to use the actual start of the memory region.
+----------------------------------------------------------- */
+
+void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t offset, bool commit, bool* large, mi_stats_t* tld_stats) {
+  mi_assert(offset <= MI_SEGMENT_SIZE);
+  mi_assert(offset <= size);
+  mi_assert((alignment % _mi_os_page_size()) == 0);
+  if (offset > MI_SEGMENT_SIZE) return NULL;
+  if (offset == 0) {
+    // regular aligned allocation
+    return _mi_os_alloc_aligned(size, alignment, commit, large, tld_stats);
+  }
+  else {
+    // overallocate to align at an offset
+    const size_t extra = _mi_align_up(offset, alignment) - offset;
+    const size_t oversize = size + extra;
+    void* start = _mi_os_alloc_aligned(oversize, alignment, commit, large, tld_stats);
+    if (start == NULL) return NULL;
+    void* p = (uint8_t*)start + extra;
+    mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment));
+    // decommit the overallocation at the start
+    if (commit && extra > _mi_os_page_size()) {
+      _mi_os_decommit(start, extra, tld_stats);
+    }
+    return p;
+  }
+}
+
+void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats) {
+  mi_assert(align_offset <= MI_SEGMENT_SIZE);
+  const size_t extra = _mi_align_up(align_offset, alignment) - align_offset;
+  void* start = (uint8_t*)p - extra;
+  _mi_os_free_ex(start, size + extra, was_committed, tld_stats);
+}
+
+/* -----------------------------------------------------------
+  OS memory API: reset, commit, decommit, protect, unprotect.
+----------------------------------------------------------- */
+
+
+// OS page align within a given area, either conservative (pages inside the area only),
+// or not (straddling pages outside the area is possible)
+static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) {
+  mi_assert(addr != NULL && size > 0);
+  if (newsize != NULL) *newsize = 0;
+  if (size == 0 || addr == NULL) return NULL;
+
+  // page align conservatively within the range
+  void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size())
+    : mi_align_down_ptr(addr, _mi_os_page_size()));
+  void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
+    : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size()));
+  ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start;
+  if (diff <= 0) return NULL;
+
+  mi_assert_internal((conservative && (size_t)diff <= size) || (!conservative && (size_t)diff >= size));
+  if (newsize != NULL) *newsize = (size_t)diff;
+  return start;
+}
+
+static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* newsize) {
+  return mi_os_page_align_areax(true, addr, size, newsize);
+}
+
+static void mi_mprotect_hint(int err) {
+#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page
+  if (err == ENOMEM) {
+    _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n"
+			"  On Linux this is controlled by the vm.max_map_count. For example:\n"
+			"  > sudo sysctl -w vm.max_map_count=262144\n");
+  }
+#else
+  MI_UNUSED(err);
+#endif
+}
+
+// Commit/Decommit memory.
+// Usually commit is aligned liberal, while decommit is aligned conservative.
+// (but not for the reset version where we want commit to be conservative as well)
+static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservative, bool* is_zero, mi_stats_t* stats) {
+  // page align in the range, commit liberally, decommit conservative
+  if (is_zero != NULL) { *is_zero = false; }
+  size_t csize;
+  void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr))
+  int err = 0;
+  if (commit) {
+    _mi_stat_increase(&stats->committed, size);  // use size for precise commit vs. decommit
+    _mi_stat_counter_increase(&stats->commit_calls, 1);
+  }
+  else {
+    _mi_stat_decrease(&stats->committed, size);
+  }
+
+  #if defined(_WIN32)
+  if (commit) {
+    // *is_zero = true;  // note: if the memory was already committed, the call succeeds but the memory is not zero'd
+    void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE);
+    err = (p == start ? 0 : GetLastError());
+  }
+  else {
+    BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT);
+    err = (ok ? 0 : GetLastError());
+  }
+  #elif defined(__wasi__)
+  // WebAssembly guests can't control memory protection
+  #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__)
+  // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?)
+  if (commit) {
+    // commit: just change the protection
+    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+    if (err != 0) { err = errno; }
+  }
+  else {
+    // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss)
+    const int fd = mi_unix_mmap_fd();
+    void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0);
+    if (p != start) { err = errno; }
+  }
+  #else
+  // Linux, macOSX and others.
+  if (commit) {
+    // commit: ensure we can access the area
+    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+    if (err != 0) { err = errno; }
+  }
+  else {
+    #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0
+    // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE)
+    // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( )
+    err = madvise(start, csize, MADV_DONTNEED);
+    #else
+    // decommit: just disable access (also used in debug and secure mode to trap on illegal access)
+    err = mprotect(start, csize, PROT_NONE);
+    if (err != 0) { err = errno; }
+    #endif
+    //#if defined(MADV_FREE_REUSE)
+    //  while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
+    //#endif
+  }
+  #endif
+  if (err != 0) {
+    _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
+    mi_mprotect_hint(err);
+  }
+  mi_assert_internal(err == 0);
+  return (err == 0);
+}
+
+bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
+}
+
+bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  bool is_zero;
+  return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
+}
+
+/*
+static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
+  return mi_os_commitx(addr, size, true, true // conservative
+		      , is_zero, stats);
+}
+*/
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) {
+  // page align conservatively within the range
+  size_t csize;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
+  if (reset) _mi_stat_increase(&stats->reset, csize);
+	else _mi_stat_decrease(&stats->reset, csize);
+  if (!reset) return true; // nothing to do on unreset!
+
+  #if (MI_DEBUG>1) && !MI_TRACK_ENABLED
+  if (MI_SECURE==0) {
+    memset(start, 0, csize); // pretend it is eagerly reset
+  }
+  #endif
+
+#if defined(_WIN32)
+  // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory
+  void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE);
+  mi_assert_internal(p == start);
+  #if 1
+  if (p == start && start != NULL) {
+    VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set
+  }
+  #endif
+  if (p != start) return false;
+#else
+#if defined(MADV_FREE)
+  static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
+  int oadvice = (int)mi_atomic_load_relaxed(&advice);
+  int err;
+  while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0;  };
+  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
+    // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
+    mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED);
+    err = mi_madvise(start, csize, MADV_DONTNEED);
+  }
+#elif defined(__wasi__)
+  int err = 0;
+#else
+  int err = mi_madvise(start, csize, MADV_DONTNEED);
+#endif
+  if (err != 0) {
+    _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno);
+  }
+  //mi_assert(err == 0);
+  if (err != 0) return false;
+#endif
+  return true;
+}
+
+// Signal to the OS that the address range is no longer in use
+// but may be used later again. This will release physical memory
+// pages and reduce swapping while keeping the memory committed.
+// We page align to a conservative area inside the range to reset.
+bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  return mi_os_resetx(addr, size, true, stats);
+}
+
+/*
+bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
+  MI_UNUSED(tld_stats);
+  mi_stats_t* stats = &_mi_stats_main;
+  *is_zero = false;
+  return mi_os_resetx(addr, size, false, stats);
+}
+*/
+
+// Protect a region in memory to be not accessible.
+static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
+  // page align conservatively within the range
+  size_t csize = 0;
+  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
+  if (csize == 0) return false;
+  /*
+  if (_mi_os_is_huge_reserved(addr)) {
+	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
+  }
+  */
+  int err = 0;
+#ifdef _WIN32
+  DWORD oldprotect = 0;
+  BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect);
+  err = (ok ? 0 : GetLastError());
+#elif defined(__wasi__)
+  err = 0;
+#else
+  err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
+  if (err != 0) { err = errno; }
+#endif
+  if (err != 0) {
+    _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err);
+    mi_mprotect_hint(err);
+  }
+  return (err == 0);
+}
+
+bool _mi_os_protect(void* addr, size_t size) {
+  return mi_os_protectx(addr, size, true);
+}
+
+bool _mi_os_unprotect(void* addr, size_t size) {
+  return mi_os_protectx(addr, size, false);
+}
+
+
+
+bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
+  // page align conservatively within the range
+  mi_assert_internal(oldsize > newsize && p != NULL);
+  if (oldsize < newsize || p == NULL) return false;
+  if (oldsize == newsize) return true;
+
+  // oldsize and newsize should be page aligned or we cannot shrink precisely
+  void* addr = (uint8_t*)p + newsize;
+  size_t size = 0;
+  void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size);
+  if (size == 0 || start != addr) return false;
+
+#ifdef _WIN32
+  // we cannot shrink on windows, but we can decommit
+  return _mi_os_decommit(start, size, stats);
+#else
+  return mi_os_mem_free(start, size, true, stats);
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------
+Support for allocating huge OS pages (1Gib) that are reserved up-front
+and possibly associated with a specific NUMA node. (use `numa_node>=0`)
+-----------------------------------------------------------------------------*/
+#define MI_HUGE_OS_PAGE_SIZE  (MI_GiB)
+
+#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8)
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
+{
+  mi_assert_internal(size%MI_GiB == 0);
+  mi_assert_internal(addr != NULL);
+  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
+
+  mi_win_enable_large_os_pages();
+
+  MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} };
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
+  static bool mi_huge_pages_available = true;
+  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+    params[0].Type.Type = MiMemExtendedParameterAttributeFlags;
+    params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+    ULONG param_count = 1;
+    if (numa_node >= 0) {
+      param_count++;
+      params[1].Type.Type = MiMemExtendedParameterNumaNode;
+      params[1].Arg.ULong = (unsigned)numa_node;
+    }
+    SIZE_T psize = size;
+    void* base = addr;
+    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    if (err == 0 && base != NULL) {
+      return base;
+    }
+    else {
+      // fall back to regular large pages
+      mi_huge_pages_available = false; // don't try further huge pages
+      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
+    }
+  }
+  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
+  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+    params[0].Type.Type = MiMemExtendedParameterNumaNode;
+    params[0].Arg.ULong = (unsigned)numa_node;
+    return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
+  }
+
+  // otherwise use regular virtual alloc on older windows
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}
+
+#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
+#include <sys/syscall.h>
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+#if defined(SYS_mbind)
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
+}
+#else
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags);
+  return 0;
+}
+#endif
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  mi_assert_internal(size%MI_GiB == 0);
+  bool is_large = true;
+  void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
+  if (p == NULL) return NULL;
+  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
+    unsigned long numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
+    long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
+    if (err != 0) {
+      _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno));
+    }
+  }
+  return p;
+}
+#else
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node);
+  return NULL;
+}
+#endif
+
+#if (MI_INTPTR_SIZE >= 8)
+// To ensure proper alignment, use our own area for huge OS pages
+static mi_decl_cache_align _Atomic(uintptr_t)  mi_huge_start; // = 0
+
+// Claim an aligned address range for huge pages
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  if (total_size != NULL) *total_size = 0;
+  const size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
+
+  uintptr_t start = 0;
+  uintptr_t end = 0;
+  uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start);
+  do {
+    start = huge_start;
+    if (start == 0) {
+      // Initialize the start address after the 32TiB area
+      start = ((uintptr_t)32 << 40);  // 32TiB virtual start address
+#if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
+      uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
+      start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF));  // (randomly 12bits)*1GiB == between 0 to 4TiB
+#endif
+    }
+    end = start + size;
+    mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
+  } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
+
+  if (total_size != NULL) *total_size = size;
+  return (uint8_t*)start;
+}
+#else
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  MI_UNUSED(pages);
+  if (total_size != NULL) *total_size = 0;
+  return NULL;
+}
+#endif
+
+// Allocate MI_SEGMENT_SIZE aligned huge pages
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+  if (psize != NULL) *psize = 0;
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  size_t size = 0;
+  uint8_t* start = mi_os_claim_huge_pages(pages, &size);
+  if (start == NULL) return NULL; // or 32-bit systems
+
+  // Allocate one page at the time but try to place them contiguously
+  // We allocate one page at the time to be able to abort if it takes too long
+  // or to at least allocate as many as available on the system.
+  mi_msecs_t start_t = _mi_clock_start();
+  size_t page;
+  for (page = 0; page < pages; page++) {
+    // allocate a page
+    void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
+    void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+
+    // Did we succeed at a contiguous address?
+    if (p != addr) {
+      // no success, issue a warning and break
+      if (p != NULL) {
+	_mi_warning_message("could not allocate contiguous huge page %zu at %p\n", page, addr);
+	_mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+      }
+      break;
+    }
+
+    // success, record it
+    _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
+    _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
+
+    // check for timeout
+    if (max_msecs > 0) {
+      mi_msecs_t elapsed = _mi_clock_end(start_t);
+      if (page >= 1) {
+	mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
+	if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
+	  elapsed = max_msecs + 1;
+	}
+      }
+      if (elapsed > max_msecs) {
+	_mi_warning_message("huge page allocation timed out\n");
+	break;
+      }
+    }
+  }
+  mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
+  if (pages_reserved != NULL) { *pages_reserved = page; }
+  if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; }
+  return (page == 0 ? NULL : start);
+}
+
+// free every huge page in a range individually (as we allocated per page)
+// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
+void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+  if (p==NULL || size==0) return;
+  uint8_t* base = (uint8_t*)p;
+  while (size >= MI_HUGE_OS_PAGE_SIZE) {
+    _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+    size -= MI_HUGE_OS_PAGE_SIZE;
+    base += MI_HUGE_OS_PAGE_SIZE;
+  }
+}
+
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
+#ifdef _WIN32
+static size_t mi_os_numa_nodex(void) {
+  USHORT numa_node = 0;
+  if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
+    // Extended API is supported
+    MI_PROCESSOR_NUMBER pnum;
+    (*pGetCurrentProcessorNumberEx)(&pnum);
+    USHORT nnode = 0;
+    BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  else if (pGetNumaProcessorNode != NULL) {
+    // Vista or earlier, use older API that is limited to 64 processors. Issue #277
+    DWORD pnum = GetCurrentProcessorNumber();
+    UCHAR nnode = 0;
+    BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode);
+    if (ok) { numa_node = nnode; }
+  }
+  return numa_node;
+}
+
+static size_t mi_os_numa_node_countx(void) {
+  ULONG numa_max = 0;
+  GetNumaHighestNodeNumber(&numa_max);
+  // find the highest node number that has actual processors assigned to it. Issue #282
+  while(numa_max > 0) {
+    if (pGetNumaNodeProcessorMaskEx != NULL) {
+      // Extended API is supported
+      GROUP_AFFINITY affinity;
+      if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
+	if (affinity.Mask != 0) break;  // found the maximum non-empty node
+      }
+    }
+    else {
+      // Vista or earlier, use older API that is limited to 64 processors.
+      ULONGLONG mask;
+      if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
+	if (mask != 0) break; // found the maximum non-empty node
+      };
+    }
+    // max node was invalid or had no processor assigned, try again
+    numa_max--;
+  }
+  return ((size_t)numa_max + 1);
+}
+#elif defined(__linux__)
+#include <sys/syscall.h>  // getcpu
+#include <stdio.h>        // access
+
+static size_t mi_os_numa_nodex(void) {
+#ifdef SYS_getcpu
+  unsigned long node = 0;
+  unsigned long ncpu = 0;
+  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+  if (err != 0) return 0;
+  return node;
+#else
+  return 0;
+#endif
+}
+static size_t mi_os_numa_node_countx(void) {
+  char buf[128];
+  unsigned node = 0;
+  for(node = 0; node < 256; node++) {
+    // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation)
+    snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1);
+    if (access(buf,R_OK) != 0) break;
+  }
+  return (node+1);
+}
+#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000
+static size_t mi_os_numa_nodex(void) {
+  domainset_t dom;
+  size_t node;
+  int policy;
+  if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul;
+  for (node = 0; node < MAXMEMDOM; node++) {
+    if (DOMAINSET_ISSET(node, &dom)) return node;
+  }
+  return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+  size_t ndomains = 0;
+  size_t len = sizeof(ndomains);
+  if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul;
+  return ndomains;
+}
+#elif defined(__DragonFly__)
+static size_t mi_os_numa_nodex(void) {
+  // TODO: DragonFly does not seem to provide any userland means to get this information.
+  return 0ul;
+}
+static size_t mi_os_numa_node_countx(void) {
+  size_t ncpus = 0, nvirtcoresperphys = 0;
+  size_t len = sizeof(size_t);
+  if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul;
+  if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul;
+  return nvirtcoresperphys * ncpus;
+}
+#else
+static size_t mi_os_numa_nodex(void) {
+  return 0;
+}
+static size_t mi_os_numa_node_countx(void) {
+  return 1;
+}
+#endif
+
+_Atomic(size_t)  _mi_numa_node_count; // = 0   // cache the node count
+
+size_t _mi_os_numa_node_count_get(void) {
+  size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
+  if (count <= 0) {
+    long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
+    if (ncount > 0) {
+      count = (size_t)ncount;
+    }
+    else {
+      count = mi_os_numa_node_countx(); // or detect dynamically
+      if (count == 0) count = 1;
+    }
+    mi_atomic_store_release(&_mi_numa_node_count, count); // save it
+    _mi_verbose_message("using %zd numa regions\n", count);
+  }
+  return count;
+}
+
+int _mi_os_numa_node_get(mi_os_tld_t* tld) {
+  MI_UNUSED(tld);
+  size_t numa_count = _mi_os_numa_node_count();
+  if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
+  // never more than the node count and >= 0
+  size_t numa_node = mi_os_numa_nodex();
+  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+  return (int)numa_node;
+}
diff --git a/compat/mimalloc/page-queue.c b/compat/mimalloc/page-queue.c
new file mode 100644
index 0000000000..5619a81f99
--- /dev/null
+++ b/compat/mimalloc/page-queue.c
@@ -0,0 +1,332 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+  Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#ifndef MI_IN_PAGE_C
+#error "this file should be included from 'page.c'"
+#endif
+
+/* -----------------------------------------------------------
+  Minimal alignment in machine words (i.e. `sizeof(void*)`)
+----------------------------------------------------------- */
+
+#if (MI_MAX_ALIGN_SIZE > 4*MI_INTPTR_SIZE)
+  #error "define alignment for more than 4x word size for this platform"
+#elif (MI_MAX_ALIGN_SIZE > 2*MI_INTPTR_SIZE)
+  #define MI_ALIGN4W   // 4 machine words minimal alignment
+#elif (MI_MAX_ALIGN_SIZE > MI_INTPTR_SIZE)
+  #define MI_ALIGN2W   // 2 machine words minimal alignment
+#else
+  // ok, default alignment is 1 word
+#endif
+
+
+/* -----------------------------------------------------------
+  Queue query
+----------------------------------------------------------- */
+
+
+static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) {
+  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+sizeof(uintptr_t)));
+}
+
+static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) {
+  return (pq->block_size == (MI_MEDIUM_OBJ_SIZE_MAX+(2*sizeof(uintptr_t))));
+}
+
+static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
+  return (pq->block_size > MI_MEDIUM_OBJ_SIZE_MAX);
+}
+
+/* -----------------------------------------------------------
+  Bins
+----------------------------------------------------------- */
+
+// Return the bin for a given field size.
+// Returns MI_BIN_HUGE if the size is too large.
+// We use `wsize` for the size in "machine word sizes",
+// i.e. byte size == `wsize*sizeof(void*)`.
+static inline uint8_t mi_bin(size_t size) {
+  size_t wsize = _mi_wsize_from_size(size);
+  uint8_t bin;
+  if (wsize <= 1) {
+    bin = 1;
+  }
+  #if defined(MI_ALIGN4W)
+  else if (wsize <= 4) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+  #elif defined(MI_ALIGN2W)
+  else if (wsize <= 8) {
+    bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
+  }
+  #else
+  else if (wsize <= 8) {
+    bin = (uint8_t)wsize;
+  }
+  #endif
+  else if (wsize > MI_MEDIUM_OBJ_WSIZE_MAX) {
+    bin = MI_BIN_HUGE;
+  }
+  else {
+    #if defined(MI_ALIGN4W)
+    if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
+    #endif
+    wsize--;
+    // find the highest bit
+    uint8_t b = (uint8_t)mi_bsr(wsize);  // note: wsize != 0
+    // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
+    // - adjust with 3 because we use do not round the first 8 sizes
+    //   which each get an exact bin
+    bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+    mi_assert_internal(bin < MI_BIN_HUGE);
+  }
+  mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
+  return bin;
+}
+
+
+
+/* -----------------------------------------------------------
+  Queue of pages with free blocks
+----------------------------------------------------------- */
+
+uint8_t _mi_bin(size_t size) {
+  return mi_bin(size);
+}
+
+size_t _mi_bin_size(uint8_t bin) {
+  return _mi_heap_empty.pages[bin].block_size;
+}
+
+// Good size for allocation
+size_t mi_good_size(size_t size) mi_attr_noexcept {
+  if (size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    return _mi_bin_size(mi_bin(size));
+  }
+  else {
+    return _mi_align_up(size,_mi_os_page_size());
+  }
+}
+
+#if (MI_DEBUG>1)
+static bool mi_page_queue_contains(mi_page_queue_t* queue, const mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_page_t* list = queue->first;
+  while (list != NULL) {
+    mi_assert_internal(list->next == NULL || list->next->prev == list);
+    mi_assert_internal(list->prev == NULL || list->prev->next == list);
+    if (list == page) break;
+    list = list->next;
+  }
+  return (list == page);
+}
+
+#endif
+
+#if (MI_DEBUG>1)
+static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* pq) {
+  return (pq >= &heap->pages[0] && pq <= &heap->pages[MI_BIN_FULL]);
+}
+#endif
+
+static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) {
+  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+  mi_heap_t* heap = mi_page_heap(page);
+  mi_assert_internal(heap != NULL && bin <= MI_BIN_FULL);
+  mi_page_queue_t* pq = &heap->pages[bin];
+  mi_assert_internal(bin >= MI_BIN_HUGE || page->xblock_size == pq->block_size);
+  mi_assert_expensive(mi_page_queue_contains(pq, page));
+  return pq;
+}
+
+static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) {
+  uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : mi_bin(page->xblock_size));
+  mi_assert_internal(bin <= MI_BIN_FULL);
+  mi_page_queue_t* pq = &heap->pages[bin];
+  mi_assert_internal(mi_page_is_in_full(page) || page->xblock_size == pq->block_size);
+  return pq;
+}
+
+// The current small page array is for efficiency and for each
+// small size (up to 256) it points directly to the page for that
+// size without having to compute the bin. This means when the
+// current free page queue is updated for a small bin, we need to update a
+// range of entries in `_mi_page_small_free`.
+static inline void mi_heap_queue_first_update(mi_heap_t* heap, const mi_page_queue_t* pq) {
+  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+  size_t size = pq->block_size;
+  if (size > MI_SMALL_SIZE_MAX) return;
+
+  mi_page_t* page = pq->first;
+  if (pq->first == NULL) page = (mi_page_t*)&_mi_page_empty;
+
+  // find index in the right direct page array
+  size_t start;
+  size_t idx = _mi_wsize_from_size(size);
+  mi_page_t** pages_free = heap->pages_free_direct;
+
+  if (pages_free[idx] == page) return;  // already set
+
+  // find start slot
+  if (idx<=1) {
+    start = 0;
+  }
+  else {
+    // find previous size; due to minimal alignment upto 3 previous bins may need to be skipped
+    uint8_t bin = mi_bin(size);
+    const mi_page_queue_t* prev = pq - 1;
+    while( bin == mi_bin(prev->block_size) && prev > &heap->pages[0]) {
+      prev--;
+    }
+    start = 1 + _mi_wsize_from_size(prev->block_size);
+    if (start > idx) start = idx;
+  }
+
+  // set size range to the right page
+  mi_assert(start <= idx);
+  for (size_t sz = start; sz <= idx; sz++) {
+    pages_free[sz] = page;
+  }
+}
+
+/*
+static bool mi_page_queue_is_empty(mi_page_queue_t* queue) {
+  return (queue->first == NULL);
+}
+*/
+
+static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(mi_page_queue_contains(queue, page));
+  mi_assert_internal(page->xblock_size == queue->block_size || (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX && mi_page_queue_is_huge(queue))  || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+  mi_heap_t* heap = mi_page_heap(page);
+
+  if (page->prev != NULL) page->prev->next = page->next;
+  if (page->next != NULL) page->next->prev = page->prev;
+  if (page == queue->last)  queue->last = page->prev;
+  if (page == queue->first) {
+    queue->first = page->next;
+    // update first
+    mi_assert_internal(mi_heap_contains_queue(heap, queue));
+    mi_heap_queue_first_update(heap,queue);
+  }
+  heap->page_count--;
+  page->next = NULL;
+  page->prev = NULL;
+  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
+  mi_page_set_in_full(page,false);
+}
+
+
+static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) {
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_assert_internal(!mi_page_queue_contains(queue, page));
+  #if MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  #endif
+  mi_assert_internal(page->xblock_size == queue->block_size ||
+		      (page->xblock_size > MI_MEDIUM_OBJ_SIZE_MAX) ||
+			(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
+
+  mi_page_set_in_full(page, mi_page_queue_is_full(queue));
+  // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
+  page->next = queue->first;
+  page->prev = NULL;
+  if (queue->first != NULL) {
+    mi_assert_internal(queue->first->prev == NULL);
+    queue->first->prev = page;
+    queue->first = page;
+  }
+  else {
+    queue->first = queue->last = page;
+  }
+
+  // update direct
+  mi_heap_queue_first_update(heap, queue);
+  heap->page_count++;
+}
+
+
+static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(mi_page_queue_contains(from, page));
+  mi_assert_expensive(!mi_page_queue_contains(to, page));
+
+  mi_assert_internal((page->xblock_size == to->block_size && page->xblock_size == from->block_size) ||
+		     (page->xblock_size == to->block_size && mi_page_queue_is_full(from)) ||
+		     (page->xblock_size == from->block_size && mi_page_queue_is_full(to)) ||
+		     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_huge(to)) ||
+		     (page->xblock_size > MI_LARGE_OBJ_SIZE_MAX && mi_page_queue_is_full(to)));
+
+  mi_heap_t* heap = mi_page_heap(page);
+  if (page->prev != NULL) page->prev->next = page->next;
+  if (page->next != NULL) page->next->prev = page->prev;
+  if (page == from->last)  from->last = page->prev;
+  if (page == from->first) {
+    from->first = page->next;
+    // update first
+    mi_assert_internal(mi_heap_contains_queue(heap, from));
+    mi_heap_queue_first_update(heap, from);
+  }
+
+  page->prev = to->last;
+  page->next = NULL;
+  if (to->last != NULL) {
+    mi_assert_internal(heap == mi_page_heap(to->last));
+    to->last->next = page;
+    to->last = page;
+  }
+  else {
+    to->first = page;
+    to->last = page;
+    mi_heap_queue_first_update(heap, to);
+  }
+
+  mi_page_set_in_full(page, mi_page_queue_is_full(to));
+}
+
+// Only called from `mi_heap_absorb`.
+size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) {
+  mi_assert_internal(mi_heap_contains_queue(heap,pq));
+  mi_assert_internal(pq->block_size == append->block_size);
+
+  if (append->first==NULL) return 0;
+
+  // set append pages to new heap and count
+  size_t count = 0;
+  for (mi_page_t* page = append->first; page != NULL; page = page->next) {
+    // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
+    // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
+    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
+    // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
+    // side effect that it spins until any DELAYED_FREEING is finished. This ensures
+    // that after appending only the new heap will be used for delayed free operations.
+    _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false);
+    count++;
+  }
+
+  if (pq->last==NULL) {
+    // take over afresh
+    mi_assert_internal(pq->first==NULL);
+    pq->first = append->first;
+    pq->last = append->last;
+    mi_heap_queue_first_update(heap, pq);
+  }
+  else {
+    // append to end
+    mi_assert_internal(pq->last!=NULL);
+    mi_assert_internal(append->first!=NULL);
+    pq->last->next = append->first;
+    append->first->prev = pq->last;
+    pq->last = append->last;
+  }
+  return count;
+}
diff --git a/compat/mimalloc/page.c b/compat/mimalloc/page.c
new file mode 100644
index 0000000000..1760135545
--- /dev/null
+++ b/compat/mimalloc/page.c
@@ -0,0 +1,926 @@
+/*----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* -----------------------------------------------------------
+  The core of the allocator. Every segment contains
+  pages of a certain block size. The main function
+  exported is `mi_malloc_generic`.
+----------------------------------------------------------- */
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+/* -----------------------------------------------------------
+  Definition of page queues for each block size
+----------------------------------------------------------- */
+
+#define MI_IN_PAGE_C
+#include "page-queue.c"
+#undef MI_IN_PAGE_C
+
+
+/* -----------------------------------------------------------
+  Page helpers
+----------------------------------------------------------- */
+
+// Index a block in a page
+static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_start, size_t block_size, size_t i) {
+  MI_UNUSED(page);
+  mi_assert_internal(page != NULL);
+  mi_assert_internal(i <= page->reserved);
+  return (mi_block_t*)((uint8_t*)page_start + (i * block_size));
+}
+
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld);
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld);
+
+#if (MI_DEBUG>=3)
+static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
+  size_t count = 0;
+  while (head != NULL) {
+    mi_assert_internal(page == _mi_ptr_page(head));
+    count++;
+    head = mi_block_next(page, head);
+  }
+  return count;
+}
+
+/*
+// Start of the page available memory
+static inline uint8_t* mi_page_area(const mi_page_t* page) {
+  return _mi_page_start(_mi_page_segment(page), page, NULL);
+}
+*/
+
+static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
+  size_t psize;
+  uint8_t* page_area = _mi_page_start(_mi_page_segment(page), page, &psize);
+  mi_block_t* start = (mi_block_t*)page_area;
+  mi_block_t* end   = (mi_block_t*)(page_area + psize);
+  while(p != NULL) {
+    if (p < start || p >= end) return false;
+    p = mi_block_next(page, p);
+  }
+  return true;
+}
+
+static bool mi_page_is_valid_init(mi_page_t* page) {
+  mi_assert_internal(page->xblock_size > 0);
+  mi_assert_internal(page->used <= page->capacity);
+  mi_assert_internal(page->capacity <= page->reserved);
+
+  mi_segment_t* segment = _mi_page_segment(page);
+  uint8_t* start = _mi_page_start(segment,page,NULL);
+  mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
+  //const size_t bsize = mi_page_block_size(page);
+  //mi_assert_internal(start + page->capacity*page->block_size == page->top);
+
+  mi_assert_internal(mi_page_list_is_valid(page,page->free));
+  mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
+
+  #if MI_DEBUG>3 // generally too expensive to check this
+  if (page->is_zero) {
+    const size_t ubsize = mi_page_usable_block_size(page);
+    for(mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) {
+      mi_assert_expensive(mi_mem_is_zero(block + 1, ubsize - sizeof(mi_block_t)));
+    }
+  }
+  #endif
+
+  mi_block_t* tfree = mi_page_thread_free(page);
+  mi_assert_internal(mi_page_list_is_valid(page, tfree));
+  //size_t tfree_count = mi_page_list_count(page, tfree);
+  //mi_assert_internal(tfree_count <= page->thread_freed + 1);
+
+  size_t free_count = mi_page_list_count(page, page->free) + mi_page_list_count(page, page->local_free);
+  mi_assert_internal(page->used + free_count == page->capacity);
+
+  return true;
+}
+
+bool _mi_page_is_valid(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_valid_init(page));
+  #if MI_SECURE
+  mi_assert_internal(page->keys[0] != 0);
+  #endif
+  if (mi_page_heap(page)!=NULL) {
+    mi_segment_t* segment = _mi_page_segment(page);
+
+    mi_assert_internal(!_mi_process_is_initialized || segment->thread_id==0 || segment->thread_id == mi_page_heap(page)->thread_id);
+    #if MI_HUGE_PAGE_ABANDON
+    if (segment->kind != MI_SEGMENT_HUGE)
+    #endif
+    {
+      mi_page_queue_t* pq = mi_page_queue_of(page);
+      mi_assert_internal(mi_page_queue_contains(pq, page));
+      mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_MEDIUM_OBJ_SIZE_MAX || mi_page_is_in_full(page));
+      mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
+    }
+  }
+  return true;
+}
+#endif
+
+void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
+  while (!_mi_page_try_use_delayed_free(page, delay, override_never)) {
+    mi_atomic_yield();
+  }
+}
+
+bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
+  mi_thread_free_t tfreex;
+  mi_delayed_t     old_delay;
+  mi_thread_free_t tfree;
+  size_t yield_count = 0;
+  do {
+    tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
+    tfreex = mi_tf_set_delayed(tfree, delay);
+    old_delay = mi_tf_delayed(tfree);
+    if mi_unlikely(old_delay == MI_DELAYED_FREEING) {
+      if (yield_count >= 4) return false;  // give up after 4 tries
+      yield_count++;
+      mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
+      // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
+    }
+    else if (delay == old_delay) {
+      break; // avoid atomic operation if already equal
+    }
+    else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) {
+      break; // leave never-delayed flag set
+    }
+  } while ((old_delay == MI_DELAYED_FREEING) ||
+	   !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
+
+  return true; // success
+}
+
+/* -----------------------------------------------------------
+  Page collect the `local_free` and `thread_free` lists
+----------------------------------------------------------- */
+
+// Collect the local `thread_free` list using an atomic exchange.
+// Note: The exchange must be done atomically as this is used right after
+// moving to the full list in `mi_page_collect_ex` and we need to
+// ensure that there was no race where the page became unfull just before the move.
+static void _mi_page_thread_free_collect(mi_page_t* page)
+{
+  mi_block_t* head;
+  mi_thread_free_t tfreex;
+  mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
+  do {
+    head = mi_tf_block(tfree);
+    tfreex = mi_tf_set_block(tfree,NULL);
+  } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
+
+  // return if the list is empty
+  if (head == NULL) return;
+
+  // find the tail -- also to get a proper count (without data races)
+  uint32_t max_count = page->capacity; // cannot collect more than capacity
+  uint32_t count = 1;
+  mi_block_t* tail = head;
+  mi_block_t* next;
+  while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) {
+    count++;
+    tail = next;
+  }
+  // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free)
+  if (count > max_count) {
+    _mi_error_message(EFAULT, "corrupted thread-free list\n");
+    return; // the thread-free items cannot be freed
+  }
+
+  // and append the current local free list
+  mi_block_set_next(page,tail, page->local_free);
+  page->local_free = head;
+
+  // update counts now
+  page->used -= count;
+}
+
+void _mi_page_free_collect(mi_page_t* page, bool force) {
+  mi_assert_internal(page!=NULL);
+
+  // collect the thread free list
+  if (force || mi_page_thread_free(page) != NULL) {  // quick test to avoid an atomic operation
+    _mi_page_thread_free_collect(page);
+  }
+
+  // and the local free list
+  if (page->local_free != NULL) {
+    if mi_likely(page->free == NULL) {
+      // usual case
+      page->free = page->local_free;
+      page->local_free = NULL;
+      page->is_zero = false;
+    }
+    else if (force) {
+      // append -- only on shutdown (force) as this is a linear operation
+      mi_block_t* tail = page->local_free;
+      mi_block_t* next;
+      while ((next = mi_block_next(page, tail)) != NULL) {
+	tail = next;
+      }
+      mi_block_set_next(page, tail, page->free);
+      page->free = page->local_free;
+      page->local_free = NULL;
+      page->is_zero = false;
+    }
+  }
+
+  mi_assert_internal(!force || page->local_free == NULL);
+}
+
+
+
+/* -----------------------------------------------------------
+  Page fresh and retire
+----------------------------------------------------------- */
+
+// called from segments when reclaiming abandoned pages
+void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
+  mi_assert_expensive(mi_page_is_valid_init(page));
+
+  mi_assert_internal(mi_page_heap(page) == heap);
+  mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);
+  #if MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  #endif
+  mi_assert_internal(!page->is_reset);
+  // TODO: push on full queue immediately if it is full?
+  mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
+  mi_page_queue_push(heap, pq, page);
+  mi_assert_expensive(_mi_page_is_valid(page));
+}
+
+// allocate a fresh page from a segment
+static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) {
+  #if !MI_HUGE_PAGE_ABANDON
+  mi_assert_internal(pq != NULL);
+  mi_assert_internal(mi_heap_contains_queue(heap, pq));
+  mi_assert_internal(page_alignment > 0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || block_size == pq->block_size);
+  #endif
+  mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os);
+  if (page == NULL) {
+    // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue)
+    return NULL;
+  }
+  mi_assert_internal(page_alignment >0 || block_size > MI_MEDIUM_OBJ_SIZE_MAX || _mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+  mi_assert_internal(pq!=NULL || page->xblock_size != 0);
+  mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size);
+  // a fresh page was found, initialize it
+  const size_t full_block_size = ((pq == NULL || mi_page_queue_is_huge(pq)) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc
+  mi_assert_internal(full_block_size >= block_size);
+  mi_page_init(heap, page, full_block_size, heap->tld);
+  mi_heap_stat_increase(heap, pages, 1);
+  if (pq != NULL) { mi_page_queue_push(heap, pq, page); }
+  mi_assert_expensive(_mi_page_is_valid(page));
+  return page;
+}
+
+// Get a fresh page to use
+static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
+  mi_assert_internal(mi_heap_contains_queue(heap, pq));
+  mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
+  if (page==NULL) return NULL;
+  mi_assert_internal(pq->block_size==mi_page_block_size(page));
+  mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
+  return page;
+}
+
+/* -----------------------------------------------------------
+   Do any delayed frees
+   (put there by other threads if they deallocated in a full page)
+----------------------------------------------------------- */
+void _mi_heap_delayed_free_all(mi_heap_t* heap) {
+  while (!_mi_heap_delayed_free_partial(heap)) {
+    mi_atomic_yield();
+  }
+}
+
+// returns true if all delayed frees were processed
+bool _mi_heap_delayed_free_partial(mi_heap_t* heap) {
+  // take over the list (note: no atomic exchange since it is often NULL)
+  mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+  while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
+  bool all_freed = true;
+
+  // and free them all
+  while(block != NULL) {
+    mi_block_t* next = mi_block_nextx(heap,block, heap->keys);
+    // use internal free instead of regular one to keep stats etc correct
+    if (!_mi_free_delayed_block(block)) {
+      // we might already start delayed freeing while another thread has not yet
+      // reset the delayed_freeing flag; in that case delay it further by reinserting the current block
+      // into the delayed free list
+      all_freed = false;
+      mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+      do {
+	mi_block_set_nextx(heap, block, dfree, heap->keys);
+      } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
+    }
+    block = next;
+  }
+  return all_freed;
+}
+
+/* -----------------------------------------------------------
+  Unfull, abandon, free and retire
+----------------------------------------------------------- */
+
+// Move a page from the full list back to a regular list
+void _mi_page_unfull(mi_page_t* page) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(mi_page_is_in_full(page));
+  if (!mi_page_is_in_full(page)) return;
+
+  mi_heap_t* heap = mi_page_heap(page);
+  mi_page_queue_t* pqfull = &heap->pages[MI_BIN_FULL];
+  mi_page_set_in_full(page, false); // to get the right queue
+  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
+  mi_page_set_in_full(page, true);
+  mi_page_queue_enqueue_from(pq, pqfull, page);
+}
+
+static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) {
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(!mi_page_immediate_available(page));
+  mi_assert_internal(!mi_page_is_in_full(page));
+
+  if (mi_page_is_in_full(page)) return;
+  mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page);
+  _mi_page_free_collect(page,false);  // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set
+}
+
+
+// Abandon a page with used blocks at the end of a thread.
+// Note: only call if it is ensured that no references exist from
+// the `page->heap->thread_delayed_free` into this page.
+// Currently only called through `mi_heap_collect_ex` which ensures this.
+void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(mi_page_heap(page) != NULL);
+
+  mi_heap_t* pheap = mi_page_heap(page);
+
+  // remove from our page list
+  mi_segments_tld_t* segments_tld = &pheap->tld->segments;
+  mi_page_queue_remove(pq, page);
+
+  // page is no longer associated with our heap
+  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+  mi_page_set_heap(page, NULL);
+
+#if MI_DEBUG>1
+  // check there are no references left..
+  for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) {
+    mi_assert_internal(_mi_ptr_page(block) != page);
+  }
+#endif
+
+  // and abandon it
+  mi_assert_internal(mi_page_heap(page) == NULL);
+  _mi_segment_page_abandon(page,segments_tld);
+}
+
+
+// Free a page with no more free blocks
+void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(pq == mi_page_queue_of(page));
+  mi_assert_internal(mi_page_all_free(page));
+  mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING);
+
+  // no more aligned blocks in here
+  mi_page_set_has_aligned(page, false);
+
+  mi_heap_t* heap = mi_page_heap(page);
+
+  // remove from the page list
+  // (no need to do _mi_heap_delayed_free first as all blocks are already free)
+  mi_segments_tld_t* segments_tld = &heap->tld->segments;
+  mi_page_queue_remove(pq, page);
+
+  // and free it
+  mi_page_set_heap(page,NULL);
+  _mi_segment_page_free(page, force, segments_tld);
+}
+
+// Retire parameters
+#define MI_MAX_RETIRE_SIZE    (MI_MEDIUM_OBJ_SIZE_MAX)
+#define MI_RETIRE_CYCLES      (8)
+
+// Retire a page with no more used blocks
+// Important to not retire too quickly though as new
+// allocations might coming.
+// Note: called from `mi_free` and benchmarks often
+// trigger this due to freeing everything and then
+// allocating again so careful when changing this.
+void _mi_page_retire(mi_page_t* page) mi_attr_noexcept {
+  mi_assert_internal(page != NULL);
+  mi_assert_expensive(_mi_page_is_valid(page));
+  mi_assert_internal(mi_page_all_free(page));
+
+  mi_page_set_has_aligned(page, false);
+
+  // don't retire too often..
+  // (or we end up retiring and re-allocating most of the time)
+  // NOTE: refine this more: we should not retire if this
+  // is the only page left with free blocks. It is not clear
+  // how to check this efficiently though...
+  // for now, we don't retire if it is the only page left of this size class.
+  mi_page_queue_t* pq = mi_page_queue_of(page);
+  if mi_likely(page->xblock_size <= MI_MAX_RETIRE_SIZE && !mi_page_queue_is_special(pq)) {  // not too large && not full or huge queue?
+    if (pq->last==page && pq->first==page) { // the only page in the queue?
+      mi_stat_counter_increase(_mi_stats_main.page_no_retire,1);
+      page->retire_expire = 1 + (page->xblock_size <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4);
+      mi_heap_t* heap = mi_page_heap(page);
+      mi_assert_internal(pq >= heap->pages);
+      const size_t index = pq - heap->pages;
+      mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE);
+      if (index < heap->page_retired_min) heap->page_retired_min = index;
+      if (index > heap->page_retired_max) heap->page_retired_max = index;
+      mi_assert_internal(mi_page_all_free(page));
+      return; // dont't free after all
+    }
+  }
+  _mi_page_free(page, pq, false);
+}
+
+// free retired pages: we don't need to look at the entire queues
+// since we only retire pages that are at the head position in a queue.
+void _mi_heap_collect_retired(mi_heap_t* heap, bool force) {
+  size_t min = MI_BIN_FULL;
+  size_t max = 0;
+  for(size_t bin = heap->page_retired_min; bin <= heap->page_retired_max; bin++) {
+    mi_page_queue_t* pq   = &heap->pages[bin];
+    mi_page_t*       page = pq->first;
+    if (page != NULL && page->retire_expire != 0) {
+      if (mi_page_all_free(page)) {
+	page->retire_expire--;
+	if (force || page->retire_expire == 0) {
+	  _mi_page_free(pq->first, pq, force);
+	}
+	else {
+	  // keep retired, update min/max
+	  if (bin < min) min = bin;
+	  if (bin > max) max = bin;
+	}
+      }
+      else {
+	page->retire_expire = 0;
+      }
+    }
+  }
+  heap->page_retired_min = min;
+  heap->page_retired_max = max;
+}
+
+
+/* -----------------------------------------------------------
+  Initialize the initial free list in a page.
+  In secure mode we initialize a randomized list by
+  alternating between slices.
+----------------------------------------------------------- */
+
+#define MI_MAX_SLICE_SHIFT  (6)   // at most 64 slices
+#define MI_MAX_SLICES       (1UL << MI_MAX_SLICE_SHIFT)
+#define MI_MIN_SLICES       (2)
+
+static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) {
+  MI_UNUSED(stats);
+  #if (MI_SECURE<=2)
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->local_free == NULL);
+  #endif
+  mi_assert_internal(page->capacity + extend <= page->reserved);
+  mi_assert_internal(bsize == mi_page_block_size(page));
+  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL);
+
+  // initialize a randomized free list
+  // set up `slice_count` slices to alternate between
+  size_t shift = MI_MAX_SLICE_SHIFT;
+  while ((extend >> shift) == 0) {
+    shift--;
+  }
+  const size_t slice_count = (size_t)1U << shift;
+  const size_t slice_extend = extend / slice_count;
+  mi_assert_internal(slice_extend >= 1);
+  mi_block_t* blocks[MI_MAX_SLICES];   // current start of the slice
+  size_t      counts[MI_MAX_SLICES];   // available objects in the slice
+  for (size_t i = 0; i < slice_count; i++) {
+    blocks[i] = mi_page_block_at(page, page_area, bsize, page->capacity + i*slice_extend);
+    counts[i] = slice_extend;
+  }
+  counts[slice_count-1] += (extend % slice_count);  // final slice holds the modulus too (todo: distribute evenly?)
+
+  // and initialize the free list by randomly threading through them
+  // set up first element
+  const uintptr_t r = _mi_heap_random_next(heap);
+  size_t current = r % slice_count;
+  counts[current]--;
+  mi_block_t* const free_start = blocks[current];
+  // and iterate through the rest; use `random_shuffle` for performance
+  uintptr_t rnd = _mi_random_shuffle(r|1); // ensure not 0
+  for (size_t i = 1; i < extend; i++) {
+    // call random_shuffle only every INTPTR_SIZE rounds
+    const size_t round = i%MI_INTPTR_SIZE;
+    if (round == 0) rnd = _mi_random_shuffle(rnd);
+    // select a random next slice index
+    size_t next = ((rnd >> 8*round) & (slice_count-1));
+    while (counts[next]==0) {                            // ensure it still has space
+      next++;
+      if (next==slice_count) next = 0;
+    }
+    // and link the current block to it
+    counts[next]--;
+    mi_block_t* const block = blocks[current];
+    blocks[current] = (mi_block_t*)((uint8_t*)block + bsize);  // bump to the following block
+    mi_block_set_next(page, block, blocks[next]);   // and set next; note: we may have `current == next`
+    current = next;
+  }
+  // prepend to the free list (usually NULL)
+  mi_block_set_next(page, blocks[current], page->free);  // end of the list
+  page->free = free_start;
+}
+
+static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats)
+{
+  MI_UNUSED(stats);
+  #if (MI_SECURE <= 2)
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->local_free == NULL);
+  #endif
+  mi_assert_internal(page->capacity + extend <= page->reserved);
+  mi_assert_internal(bsize == mi_page_block_size(page));
+  void* const page_area = _mi_page_start(_mi_page_segment(page), page, NULL );
+
+  mi_block_t* const start = mi_page_block_at(page, page_area, bsize, page->capacity);
+
+  // initialize a sequential free list
+  mi_block_t* const last = mi_page_block_at(page, page_area, bsize, page->capacity + extend - 1);
+  mi_block_t* block = start;
+  while(block <= last) {
+    mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize);
+    mi_block_set_next(page,block,next);
+    block = next;
+  }
+  // prepend to free list (usually `NULL`)
+  mi_block_set_next(page, last, page->free);
+  page->free = start;
+}
+
+/* -----------------------------------------------------------
+  Page initialize and extend the capacity
+----------------------------------------------------------- */
+
+#define MI_MAX_EXTEND_SIZE    (4*1024)      // heuristic, one OS page seems to work well.
+#if (MI_SECURE>0)
+#define MI_MIN_EXTEND         (8*MI_SECURE) // extend at least by this many
+#else
+#define MI_MIN_EXTEND         (4)
+#endif
+
+// Extend the capacity (up to reserved) by initializing a free list
+// We do at most `MI_MAX_EXTEND` to avoid touching too much memory
+// Note: we also experimented with "bump" allocation on the first
+// allocations but this did not speed up any benchmark (due to an
+// extra test in malloc? or cache effects?)
+static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) {
+  MI_UNUSED(tld);
+  mi_assert_expensive(mi_page_is_valid_init(page));
+  #if (MI_SECURE<=2)
+  mi_assert(page->free == NULL);
+  mi_assert(page->local_free == NULL);
+  if (page->free != NULL) return;
+  #endif
+  if (page->capacity >= page->reserved) return;
+
+  size_t page_size;
+  _mi_page_start(_mi_page_segment(page), page, &page_size);
+  mi_stat_counter_increase(tld->stats.pages_extended, 1);
+
+  // calculate the extend count
+  const size_t bsize = (page->xblock_size < MI_HUGE_BLOCK_SIZE ? page->xblock_size : page_size);
+  size_t extend = page->reserved - page->capacity;
+  mi_assert_internal(extend > 0);
+
+  size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/(uint32_t)bsize);
+  if (max_extend < MI_MIN_EXTEND) { max_extend = MI_MIN_EXTEND; }
+  mi_assert_internal(max_extend > 0);
+
+  if (extend > max_extend) {
+    // ensure we don't touch memory beyond the page to reduce page commit.
+    // the `lean` benchmark tests this. Going from 1 to 8 increases rss by 50%.
+    extend = max_extend;
+  }
+
+  mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved);
+  mi_assert_internal(extend < (1UL<<16));
+
+  // and append the extend the free list
+  if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) {
+    mi_page_free_list_extend(page, bsize, extend, &tld->stats );
+  }
+  else {
+    mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats);
+  }
+  // enable the new free list
+  page->capacity += (uint16_t)extend;
+  mi_stat_increase(tld->stats.page_committed, extend * bsize);
+
+  // extension into zero initialized memory preserves the zero'd free list
+  if (!page->is_zero_init) {
+    page->is_zero = false;
+  }
+  mi_assert_expensive(mi_page_is_valid_init(page));
+}
+
+// Initialize a fresh page
+static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) {
+  mi_assert(page != NULL);
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert(segment != NULL);
+  mi_assert_internal(block_size > 0);
+  // set fields
+  mi_page_set_heap(page, heap);
+  page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE); // initialize before _mi_segment_page_start
+  size_t page_size;
+  const void* page_start = _mi_segment_page_start(segment, page, &page_size);
+  MI_UNUSED(page_start);
+  mi_track_mem_noaccess(page_start,page_size);
+  mi_assert_internal(mi_page_block_size(page) <= page_size);
+  mi_assert_internal(page_size <= page->slice_count*MI_SEGMENT_SLICE_SIZE);
+  mi_assert_internal(page_size / block_size < (1L<<16));
+  page->reserved = (uint16_t)(page_size / block_size);
+  mi_assert_internal(page->reserved > 0);
+  #ifdef MI_ENCODE_FREELIST
+  page->keys[0] = _mi_heap_random_next(heap);
+  page->keys[1] = _mi_heap_random_next(heap);
+  #endif
+  #if MI_DEBUG > 0
+  page->is_zero = false; // ensure in debug mode we initialize with MI_DEBUG_UNINIT, see issue #501
+  #else
+  page->is_zero = page->is_zero_init;
+  #endif
+
+  mi_assert_internal(page->is_committed);
+  mi_assert_internal(!page->is_reset);
+  mi_assert_internal(page->capacity == 0);
+  mi_assert_internal(page->free == NULL);
+  mi_assert_internal(page->used == 0);
+  mi_assert_internal(page->xthread_free == 0);
+  mi_assert_internal(page->next == NULL);
+  mi_assert_internal(page->prev == NULL);
+  mi_assert_internal(page->retire_expire == 0);
+  mi_assert_internal(!mi_page_has_aligned(page));
+  #if (MI_ENCODE_FREELIST)
+  mi_assert_internal(page->keys[0] != 0);
+  mi_assert_internal(page->keys[1] != 0);
+  #endif
+  mi_assert_expensive(mi_page_is_valid_init(page));
+
+  // initialize an initial free list
+  mi_page_extend_free(heap,page,tld);
+  mi_assert(mi_page_immediate_available(page));
+}
+
+
+/* -----------------------------------------------------------
+  Find pages with free blocks
+-------------------------------------------------------------*/
+
+// Find a page with free blocks of `page->block_size`.
+static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try)
+{
+  // search through the pages in "next fit" order
+  size_t count = 0;
+  mi_page_t* page = pq->first;
+  while (page != NULL)
+  {
+    mi_page_t* next = page->next; // remember next
+    count++;
+
+    // 0. collect freed blocks by us and other threads
+    _mi_page_free_collect(page, false);
+
+    // 1. if the page contains free blocks, we are done
+    if (mi_page_immediate_available(page)) {
+      break;  // pick this one
+    }
+
+    // 2. Try to extend
+    if (page->capacity < page->reserved) {
+      mi_page_extend_free(heap, page, heap->tld);
+      mi_assert_internal(mi_page_immediate_available(page));
+      break;
+    }
+
+    // 3. If the page is completely full, move it to the `mi_pages_full`
+    // queue so we don't visit long-lived pages too often.
+    mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page));
+    mi_page_to_full(page, pq);
+
+    page = next;
+  } // for each page
+
+  mi_heap_stat_counter_increase(heap, searches, count);
+
+  if (page == NULL) {
+    _mi_heap_collect_retired(heap, false); // perhaps make a page available?
+    page = mi_page_fresh(heap, pq);
+    if (page == NULL && first_try) {
+      // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
+      page = mi_page_queue_find_free_ex(heap, pq, false);
+    }
+  }
+  else {
+    mi_assert(pq->first == page);
+    page->retire_expire = 0;
+  }
+  mi_assert_internal(page == NULL || mi_page_immediate_available(page));
+  return page;
+}
+
+
+
+// Find a page with free blocks of `size`.
+static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
+  mi_page_queue_t* pq = mi_page_queue(heap,size);
+  mi_page_t* page = pq->first;
+  if (page != NULL) {
+   #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
+    if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
+      mi_page_extend_free(heap, page, heap->tld);
+      mi_assert_internal(mi_page_immediate_available(page));
+    }
+    else
+   #endif
+    {
+      _mi_page_free_collect(page,false);
+    }
+
+    if (mi_page_immediate_available(page)) {
+      page->retire_expire = 0;
+      return page; // fast path
+    }
+  }
+  return mi_page_queue_find_free_ex(heap, pq, true);
+}
+
+
+/* -----------------------------------------------------------
+  Users can register a deferred free function called
+  when the `free` list is empty. Since the `local_free`
+  is separate this is deterministically called after
+  a certain number of allocations.
+----------------------------------------------------------- */
+
+static mi_deferred_free_fun* volatile deferred_free = NULL;
+static _Atomic(void*) deferred_arg; // = NULL
+
+void _mi_deferred_free(mi_heap_t* heap, bool force) {
+  heap->tld->heartbeat++;
+  if (deferred_free != NULL && !heap->tld->recurse) {
+    heap->tld->recurse = true;
+    deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
+    heap->tld->recurse = false;
+  }
+}
+
+void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
+  deferred_free = fn;
+  mi_atomic_store_ptr_release(void,&deferred_arg, arg);
+}
+
+
+/* -----------------------------------------------------------
+  General allocation
+----------------------------------------------------------- */
+
+// Large and huge page allocation.
+// Huge pages are allocated directly without being in a queue.
+// Because huge pages contain just one block, and the segment contains
+// just that page, we always treat them as abandoned and any thread
+// that frees the block can free the whole page and segment directly.
+// Huge pages are also use if the requested alignment is very large (> MI_ALIGNMENT_MAX).
+static mi_page_t* mi_large_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) {
+  size_t block_size = _mi_os_good_alloc_size(size);
+  mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0);
+  bool is_huge = (block_size > MI_LARGE_OBJ_SIZE_MAX || page_alignment > 0);
+  #if MI_HUGE_PAGE_ABANDON
+  mi_page_queue_t* pq = (is_huge ? NULL : mi_page_queue(heap, block_size));
+  #else
+  mi_page_queue_t* pq = mi_page_queue(heap, is_huge ? MI_HUGE_BLOCK_SIZE : block_size); // not block_size as that can be low if the page_alignment > 0
+  mi_assert_internal(!is_huge || mi_page_queue_is_huge(pq));
+  #endif
+  mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment);
+  if (page != NULL) {
+    mi_assert_internal(mi_page_immediate_available(page));
+
+    if (is_huge) {
+      mi_assert_internal(_mi_page_segment(page)->kind == MI_SEGMENT_HUGE);
+      mi_assert_internal(_mi_page_segment(page)->used==1);
+      #if MI_HUGE_PAGE_ABANDON
+      mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue
+      mi_page_set_heap(page, NULL);
+      #endif
+    }
+    else {
+      mi_assert_internal(_mi_page_segment(page)->kind != MI_SEGMENT_HUGE);
+    }
+
+    const size_t bsize = mi_page_usable_block_size(page);  // note: not `mi_page_block_size` to account for padding
+    if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
+      mi_heap_stat_increase(heap, large, bsize);
+      mi_heap_stat_counter_increase(heap, large_count, 1);
+    }
+    else {
+      mi_heap_stat_increase(heap, huge, bsize);
+      mi_heap_stat_counter_increase(heap, huge_count, 1);
+    }
+  }
+  return page;
+}
+
+
+// Allocate a page
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept {
+  // huge allocation?
+  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
+  if mi_unlikely(req_size > (MI_MEDIUM_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) {
+    if mi_unlikely(req_size > PTRDIFF_MAX) {  // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
+      _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
+      return NULL;
+    }
+    else {
+      return mi_large_huge_page_alloc(heap,size,huge_alignment);
+    }
+  }
+  else {
+    // otherwise find a page with free blocks in our size segregated queues
+    mi_assert_internal(size >= MI_PADDING_SIZE);
+    return mi_find_free_page(heap, size);
+  }
+}
+
+// Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
+// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
+// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for
+// very large requested alignments in which case we use a huge segment.
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept
+{
+  mi_assert_internal(heap != NULL);
+
+  // initialize if necessary
+  if mi_unlikely(!mi_heap_is_initialized(heap)) {
+    mi_thread_init(); // calls `_mi_heap_init` in turn
+    heap = mi_get_default_heap();
+    if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; }
+  }
+  mi_assert_internal(mi_heap_is_initialized(heap));
+
+  // call potential deferred free routines
+  _mi_deferred_free(heap, false);
+
+  // free delayed frees from other threads (but skip contended ones)
+  _mi_heap_delayed_free_partial(heap);
+
+  // find (or allocate) a page of the right size
+  mi_page_t* page = mi_find_page(heap, size, huge_alignment);
+  if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more
+    mi_heap_collect(heap, true /* force */);
+    page = mi_find_page(heap, size, huge_alignment);
+  }
+
+  if mi_unlikely(page == NULL) { // out of memory
+    const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
+    _mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size);
+    return NULL;
+  }
+
+  mi_assert_internal(mi_page_immediate_available(page));
+  mi_assert_internal(mi_page_block_size(page) >= size);
+
+  // and try again, this time succeeding! (i.e. this should never recurse through _mi_page_malloc)
+  if mi_unlikely(zero && page->xblock_size == 0) {
+    // note: we cannot call _mi_page_malloc with zeroing for huge blocks; we zero it afterwards in that case.
+    void* p = _mi_page_malloc(heap, page, size, false);
+    mi_assert_internal(p != NULL);
+    _mi_memzero_aligned(p, mi_page_usable_block_size(page));
+    return p;
+  }
+  else {
+    return _mi_page_malloc(heap, page, size, zero);
+  }
+}
diff --git a/compat/mimalloc/random.c b/compat/mimalloc/random.c
new file mode 100644
index 0000000000..06d4ba4ad6
--- /dev/null
+++ b/compat/mimalloc/random.c
@@ -0,0 +1,404 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#ifndef _DEFAULT_SOURCE
+#define _DEFAULT_SOURCE   // for syscall() on Linux
+#endif
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+#include <string.h> // memset
+
+/* ----------------------------------------------------------------------------
+We use our own PRNG to keep predictable performance of random number generation
+and to avoid implementations that use a lock. We only use the OS provided
+random source to initialize the initial seeds. Since we do not need ultimate
+performance but we do rely on the security (for secret cookies in secure mode)
+we use a cryptographically secure generator (chacha20).
+-----------------------------------------------------------------------------*/
+
+#define MI_CHACHA_ROUNDS (20)   // perhaps use 12 for better performance?
+
+
+/* ----------------------------------------------------------------------------
+Chacha20 implementation as the original algorithm with a 64-bit nonce
+and counter: https://en.wikipedia.org/wiki/Salsa20
+The input matrix has sixteen 32-bit values:
+Position  0 to  3: constant key
+Position  4 to 11: the key
+Position 12 to 13: the counter.
+Position 14 to 15: the nonce.
+
+The implementation uses regular C code which compiles very well on modern compilers.
+(gcc x64 has no register spills, and clang 6+ uses SSE instructions)
+-----------------------------------------------------------------------------*/
+
+static inline uint32_t rotl(uint32_t x, uint32_t shift) {
+  return (x << shift) | (x >> (32 - shift));
+}
+
+static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
+  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+}
+
+static void chacha_block(mi_random_ctx_t* ctx)
+{
+  // scramble into `x`
+  uint32_t x[16];
+  for (size_t i = 0; i < 16; i++) {
+    x[i] = ctx->input[i];
+  }
+  for (size_t i = 0; i < MI_CHACHA_ROUNDS; i += 2) {
+    qround(x, 0, 4,  8, 12);
+    qround(x, 1, 5,  9, 13);
+    qround(x, 2, 6, 10, 14);
+    qround(x, 3, 7, 11, 15);
+    qround(x, 0, 5, 10, 15);
+    qround(x, 1, 6, 11, 12);
+    qround(x, 2, 7,  8, 13);
+    qround(x, 3, 4,  9, 14);
+  }
+
+  // add scrambled data to the initial state
+  for (size_t i = 0; i < 16; i++) {
+    ctx->output[i] = x[i] + ctx->input[i];
+  }
+  ctx->output_available = 16;
+
+  // increment the counter for the next round
+  ctx->input[12] += 1;
+  if (ctx->input[12] == 0) {
+    ctx->input[13] += 1;
+    if (ctx->input[13] == 0) {  // and keep increasing into the nonce
+      ctx->input[14] += 1;
+    }
+  }
+}
+
+static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
+  if (ctx->output_available <= 0) {
+    chacha_block(ctx);
+    ctx->output_available = 16; // (assign again to suppress static analysis warning)
+  }
+  const uint32_t x = ctx->output[16 - ctx->output_available];
+  ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
+  ctx->output_available--;
+  return x;
+}
+
+static inline uint32_t read32(const uint8_t* p, size_t idx32) {
+  const size_t i = 4*idx32;
+  return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
+}
+
+static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
+{
+  // since we only use chacha for randomness (and not encryption) we
+  // do not _need_ to read 32-bit values as little endian but we do anyways
+  // just for being compatible :-)
+  memset(ctx, 0, sizeof(*ctx));
+  for (size_t i = 0; i < 4; i++) {
+    const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
+    ctx->input[i] = read32(sigma,i);
+  }
+  for (size_t i = 0; i < 8; i++) {
+    ctx->input[i + 4] = read32(key,i);
+  }
+  ctx->input[12] = 0;
+  ctx->input[13] = 0;
+  ctx->input[14] = (uint32_t)nonce;
+  ctx->input[15] = (uint32_t)(nonce >> 32);
+}
+
+static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
+  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
+  ctx_new->input[12] = 0;
+  ctx_new->input[13] = 0;
+  ctx_new->input[14] = (uint32_t)nonce;
+  ctx_new->input[15] = (uint32_t)(nonce >> 32);
+  mi_assert_internal(ctx->input[14] != ctx_new->input[14] || ctx->input[15] != ctx_new->input[15]); // do not reuse nonces!
+  chacha_block(ctx_new);
+}
+
+
+/* ----------------------------------------------------------------------------
+Random interface
+-----------------------------------------------------------------------------*/
+
+#if MI_DEBUG>1
+static bool mi_random_is_initialized(mi_random_ctx_t* ctx) {
+  return (ctx != NULL && ctx->input[0] != 0);
+}
+#endif
+
+void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* ctx_new) {
+  mi_assert_internal(mi_random_is_initialized(ctx));
+  mi_assert_internal(ctx != ctx_new);
+  chacha_split(ctx, (uintptr_t)ctx_new /*nonce*/, ctx_new);
+}
+
+uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
+  mi_assert_internal(mi_random_is_initialized(ctx));
+  #if MI_INTPTR_SIZE <= 4
+    return chacha_next32(ctx);
+  #elif MI_INTPTR_SIZE == 8
+    return (((uintptr_t)chacha_next32(ctx) << 32) | chacha_next32(ctx));
+  #else
+  # error "define mi_random_next for this platform"
+  #endif
+}
+
+
+/* ----------------------------------------------------------------------------
+To initialize a fresh random context we rely on the OS:
+- Windows     : BCryptGenRandom (or RtlGenRandom)
+- macOS       : CCRandomGenerateBytes, arc4random_buf
+- bsd,wasi    : arc4random_buf
+- Linux       : getrandom,/dev/urandom
+If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
+-----------------------------------------------------------------------------*/
+
+#if defined(_WIN32)
+
+#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus)
+// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using
+// dynamic overriding, we observed it can raise an exception when compiled with C++, and
+// sometimes deadlocks when also running under the VS debugger.
+// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom.
+// To be continued..
+#pragma comment (lib,"advapi32.lib")
+#define RtlGenRandom  SystemFunction036
+#ifdef __cplusplus
+extern "C" {
+#endif
+BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+#ifdef __cplusplus
+}
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+  return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
+}
+#else
+
+#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG
+#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002
+#endif
+
+typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG);
+static  PBCryptGenRandom pBCryptGenRandom = NULL;
+
+static bool os_random_buf(void* buf, size_t buf_len) {
+  if (pBCryptGenRandom == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll"));
+    if (hDll != NULL) {
+      pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom");
+    }
+  }
+  if (pBCryptGenRandom == NULL) {
+    return false;
+  }
+  else {
+    return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
+  }
+}
+#endif
+
+#elif defined(__APPLE__)
+#include <AvailabilityMacros.h>
+#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10
+#include <CommonCrypto/CommonCryptoError.h>
+#include <CommonCrypto/CommonRandom.h>
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+  #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15
+    // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf
+    // may fail silently on macOS. See PR #390, and <https://opensource.apple.com/source/Libc/Libc-1439.40.11/gen/FreeBSD/arc4random.c.auto.html>
+    return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess);
+  #else
+    // fall back on older macOS
+    arc4random_buf(buf, buf_len);
+    return true;
+  #endif
+}
+
+#elif defined(__ANDROID__) || defined(__DragonFly__) || \
+      defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
+      defined(__sun) // todo: what to use with __wasi__?
+#include <stdlib.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+  arc4random_buf(buf, buf_len);
+  return true;
+}
+#elif defined(__linux__) || defined(__HAIKU__)
+#if defined(__linux__)
+#include <sys/syscall.h>
+#endif
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+static bool os_random_buf(void* buf, size_t buf_len) {
+  // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h`
+  // and for the latter the actual `getrandom` call is not always defined.
+  // (see <https://stackoverflow.com/questions/45237324/why-doesnt-getrandom-compile>)
+  // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed.
+#ifdef SYS_getrandom
+  #ifndef GRND_NONBLOCK
+  #define GRND_NONBLOCK (1)
+  #endif
+  static _Atomic(uintptr_t) no_getrandom; // = 0
+  if (mi_atomic_load_acquire(&no_getrandom)==0) {
+    ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
+    if (ret >= 0) return (buf_len == (size_t)ret);
+    if (errno != ENOSYS) return false;
+    mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
+  }
+#endif
+  int flags = O_RDONLY;
+  #if defined(O_CLOEXEC)
+  flags |= O_CLOEXEC;
+  #endif
+  int fd = open("/dev/urandom", flags, 0);
+  if (fd < 0) return false;
+  size_t count = 0;
+  while(count < buf_len) {
+    ssize_t ret = read(fd, (char*)buf + count, buf_len - count);
+    if (ret<=0) {
+      if (errno!=EAGAIN && errno!=EINTR) break;
+    }
+    else {
+      count += ret;
+    }
+  }
+  close(fd);
+  return (count==buf_len);
+}
+#else
+static bool os_random_buf(void* buf, size_t buf_len) {
+  return false;
+}
+#endif
+
+#if defined(_WIN32)
+#include <windows.h>
+#elif defined(__APPLE__)
+#include <mach/mach_time.h>
+#else
+#include <time.h>
+#endif
+
+uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
+  uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
+
+  #if defined(_WIN32)
+    LARGE_INTEGER pcount;
+    QueryPerformanceCounter(&pcount);
+    x ^= (uintptr_t)(pcount.QuadPart);
+  #elif defined(__APPLE__)
+    x ^= (uintptr_t)mach_absolute_time();
+  #else
+    struct timespec time;
+    clock_gettime(CLOCK_MONOTONIC, &time);
+    x ^= (uintptr_t)time.tv_sec;
+    x ^= (uintptr_t)time.tv_nsec;
+  #endif
+  // and do a few randomization steps
+  uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
+  for (uintptr_t i = 0; i < max; i++) {
+    x = _mi_random_shuffle(x);
+  }
+  mi_assert_internal(x != 0);
+  return x;
+}
+
+static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) {
+  uint8_t key[32];
+  if (use_weak || !os_random_buf(key, sizeof(key))) {
+    // if we fail to get random data from the OS, we fall back to a
+    // weak random source based on the current time
+    #if !defined(__wasi__)
+    if (!use_weak) { _mi_warning_message("unable to use secure randomness\n"); }
+    #endif
+    uintptr_t x = _mi_os_random_weak(0);
+    for (size_t i = 0; i < 8; i++) {  // key is eight 32-bit words.
+      x = _mi_random_shuffle(x);
+      ((uint32_t*)key)[i] = (uint32_t)x;
+    }
+    ctx->weak = true;
+  }
+  else {
+    ctx->weak = false;
+  }
+  chacha_init(ctx, key, (uintptr_t)ctx /*nonce*/ );
+}
+
+void _mi_random_init(mi_random_ctx_t* ctx) {
+  mi_random_init_ex(ctx, false);
+}
+
+void _mi_random_init_weak(mi_random_ctx_t * ctx) {
+  mi_random_init_ex(ctx, true);
+}
+
+void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx) {
+  if (ctx->weak) {
+    _mi_random_init(ctx);
+  }
+}
+
+/* --------------------------------------------------------
+test vectors from <https://tools.ietf.org/html/rfc8439>
+----------------------------------------------------------- */
+/*
+static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    if (x[i] != y[i]) return false;
+  }
+  return true;
+}
+static void chacha_test(void)
+{
+  uint32_t x[4] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };
+  uint32_t x_out[4] = { 0xea2a92f4, 0xcb1cf8ce, 0x4581472e, 0x5881c4bb };
+  qround(x, 0, 1, 2, 3);
+  mi_assert_internal(array_equals(x, x_out, 4));
+
+  uint32_t y[16] = {
+       0x879531e0,  0xc5ecf37d,  0x516461b1,  0xc9a62f8a,
+       0x44c20ef3,  0x3390af7f,  0xd9fc690b,  0x2a5f714c,
+       0x53372767,  0xb00a5631,  0x974c541a,  0x359e9963,
+       0x5c971061,  0x3d631689,  0x2098d9d6,  0x91dbd320 };
+  uint32_t y_out[16] = {
+       0x879531e0,  0xc5ecf37d,  0xbdb886dc,  0xc9a62f8a,
+       0x44c20ef3,  0x3390af7f,  0xd9fc690b,  0xcfacafd2,
+       0xe46bea80,  0xb00a5631,  0x974c541a,  0x359e9963,
+       0x5c971061,  0xccc07c79,  0x2098d9d6,  0x91dbd320 };
+  qround(y, 2, 7, 8, 13);
+  mi_assert_internal(array_equals(y, y_out, 16));
+
+  mi_random_ctx_t r = {
+    { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,
+      0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
+      0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c,
+      0x00000001, 0x09000000, 0x4a000000, 0x00000000 },
+    {0},
+    0
+  };
+  uint32_t r_out[16] = {
+       0xe4e7f110, 0x15593bd1, 0x1fdd0f50, 0xc47120a3,
+       0xc7f4d1c7, 0x0368c033, 0x9aaa2204, 0x4e6cd4c3,
+       0x466482d2, 0x09aa9f07, 0x05d7c214, 0xa2028bd9,
+       0xd19c12b5, 0xb94e16de, 0xe883d0cb, 0x4e3c50a2 };
+  chacha_block(&r);
+  mi_assert_internal(array_equals(r.output, r_out, 16));
+}
+*/
diff --git a/compat/mimalloc/readme.md b/compat/mimalloc/readme.md
new file mode 100644
index 0000000000..932ac0f178
--- /dev/null
+++ b/compat/mimalloc/readme.md
@@ -0,0 +1,763 @@
+
+<img align="left" width="100" height="100" src="doc/mimalloc-logo.png"/>
+
+[<img align="right" src="https://dev.azure.com/Daan0324/mimalloc/_apis/build/status/microsoft.mimalloc?branchName=dev"/>](https://dev.azure.com/Daan0324/mimalloc/_build?definitionId=1&_a=summary)
+
+# mimalloc
+
+&nbsp;
+
+mimalloc (pronounced "me-malloc")
+is a general purpose allocator with excellent [performance](#performance) characteristics.
+Initially developed by Daan Leijen for the run-time systems of the
+[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
+
+Latest release tag: `v2.0.9` (2022-12-23).
+Latest stable  tag: `v1.7.9` (2022-12-23).
+
+mimalloc is a drop-in replacement for `malloc` and can be used in other programs
+without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
+```
+> LD_PRELOAD=/usr/lib/libmimalloc.so  myprogram
+```
+It also has an easy way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include:
+
+- __small and consistent__: the library is about 8k LOC using simple and
+  consistent data structures. This makes it very suitable
+  to integrate and adapt in other projects. For runtime systems it
+  provides hooks for a monotonic _heartbeat_ and deferred freeing (for
+  bounded worst-case times with reference counting).
+  Partly due to its simplicity, mimalloc has been ported to many systems (Windows, macOS,
+  Linux, WASM, various BSD's, Haiku, MUSL, etc) and has excellent support for dynamic overriding.
+- __free list sharding__: instead of one big free list (per size class) we have
+  many smaller lists per "mimalloc page" which reduces fragmentation and
+  increases locality --
+  things that are allocated close in time get allocated close in memory.
+  (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
+- __free list multi-sharding__: the big idea! Not only do we shard the free list
+  per mimalloc page, but for each page we have multiple free lists. In particular, there
+  is one list for thread-local `free` operations, and another one for concurrent `free`
+  operations. Free-ing from another thread can now be a single CAS without needing
+  sophisticated coordination between threads. Since there will be
+  thousands of separate free lists, contention is naturally distributed over the heap,
+  and the chance of contending on a single location will be low -- this is quite
+  similar to randomized algorithms like skip lists where adding
+  a random oracle removes the need for a more complex algorithm.
+- __eager page reset__: when a "page" becomes empty (with increased chance
+  due to free list sharding) the memory is marked to the OS as unused (reset or decommitted)
+  reducing (real) memory pressure and fragmentation, especially in long running
+  programs.
+- __secure__: _mimalloc_ can be built in secure mode, adding guard pages,
+  randomized allocation, encrypted free lists, etc. to protect against various
+  heap vulnerabilities. The performance penalty is usually around 10% on average
+  over our benchmarks.
+- __first-class heaps__: efficiently create and use multiple heaps to allocate across different regions.
+  A heap can be destroyed at once instead of deallocating each object separately.
+- __bounded__: it does not suffer from _blowup_ \[1\], has bounded worst-case allocation
+  times (_wcat_) (upto OS primitives), bounded space overhead (~0.2% meta-data, with low
+  internal fragmentation), and has no internal points of contention using only atomic operations.
+- __fast__: In our benchmarks (see [below](#performance)),
+  _mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
+  and often uses less memory. A nice property is that it does consistently well over a wide range
+  of benchmarks. There is also good huge OS page support for larger server programs.
+
+The [documentation](https://microsoft.github.io/mimalloc) gives a full overview of the API.
+You can read more on the design of _mimalloc_ in the [technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action) which also has detailed benchmark results.
+
+Enjoy!
+
+### Branches
+
+* `master`: latest stable release (based on `dev-slice`).
+* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
+* `dev-slice`: development branch for mimalloc v2. This branch is downstream of `dev`.
+
+### Releases
+
+Note: the `v2.x` version has a new algorithm for managing internal mimalloc pages that tends to use reduce memory usage
+  and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance
+  (see [below](#performance)); please report if you observe any significant performance regression.
+
+* 2022-12-23, `v1.7.9`, `v2.0.9`: Supports building with asan and improved [Valgrind] support. Support abitrary large
+  alignments (in particular for `std::pmr` pools).
+  Added C++ STL allocators attached to a specific heap (thanks @vmarkovtsev).
+  Heap walks now visit all object (including huge objects). Support Windows nano server containers (by Johannes Schindelin,@dscho).
+  Various small bug fixes.
+
+* 2022-11-03, `v1.7.7`, `v2.0.7`: Initial support for [Valgrind] for leak testing and heap block overflow detection. Initial
+  support for attaching heaps to a speficic memory area (only in v2). Fix `realloc` behavior for zero size blocks, remove restriction to integral multiple of the alignment in `alloc_align`, improved aligned allocation performance, reduced contention with many threads on few processors (thank you @dposluns!), vs2022 support, support `pkg-config`, .
+
+* 2022-04-14, `v1.7.6`, `v2.0.6`: fix fallback path for aligned OS allocation on Windows, improve Windows aligned allocation
+  even when compiling with older SDK's, fix dynamic overriding on macOS Monterey, fix MSVC C++ dynamic overriding, fix
+  warnings under Clang 14, improve performance if many OS threads are created and destroyed, fix statistics for large object
+  allocations, using MIMALLOC_VERBOSE=1 has no maximum on the number of error messages, various small fixes.
+
+* 2022-02-14, `v1.7.5`, `v2.0.5` (alpha): fix malloc override on
+  Windows 11, fix compilation with musl, potentially reduced
+  committed memory, add `bin/minject` for Windows,
+  improved wasm support, faster aligned allocation,
+  various small fixes.
+
+* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
+  M1), improved performance for v2 for large objects, Python integration improvements, more standard
+  installation directories, various small fixes.
+
+* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
+  thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
+
+* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
+
+* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
+
+* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
+  improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
+
+* [Older release notes](#older-release-notes)
+
+Special thanks to:
+
+* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
+  mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
+* Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
+  memory model bugs using the [genMC] model checker.
+* Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment
+  at large scale services, leading to many improvements in the mimalloc algorithms for large workloads.
+* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs
+  in (early versions of) `mimalloc`.
+* Manuel Pöter (@mpoeter) and Sam Gross(@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation. Sam also created the [no GIL](https://github.com/colesbury/nogil) Python fork which
+  uses mimalloc internally.
+
+
+[genMC]: https://plv.mpi-sws.org/genmc/
+
+### Usage
+
+mimalloc is used in various large scale low-latency services and programs, for example:
+
+<a href="https://www.bing.com"><img height="50" align="left" src="https://upload.wikimedia.org/wikipedia/commons/e/e9/Bing_logo.svg"></a>
+<a href="https://azure.microsoft.com/"><img height="50" align="left" src="https://upload.wikimedia.org/wikipedia/commons/a/a8/Microsoft_Azure_Logo.svg"></a>
+<a href="https://deathstrandingpc.505games.com"><img height="100" src="doc/ds-logo.png"></a>
+<a href="https://docs.unrealengine.com/4.26/en-US/WhatsNew/Builds/ReleaseNotes/4_25/"><img height="100" src="doc/unreal-logo.svg"></a>
+<a href="https://cab.spbu.ru/software/spades/"><img height="100" src="doc/spades-logo.png"></a>
+
+
+# Building
+
+## Windows
+
+Open `ide/vs2019/mimalloc.sln` in Visual Studio 2019 and build.
+The `mimalloc` project builds a static library (in `out/msvc-x64`), while the
+`mimalloc-override` project builds a DLL for overriding malloc
+in the entire program.
+
+## macOS, Linux, BSD, etc.
+
+We use [`cmake`](https://cmake.org)<sup>1</sup> as the build system:
+
+```
+> mkdir -p out/release
+> cd out/release
+> cmake ../..
+> make
+```
+This builds the library as a shared (dynamic)
+library (`.so` or `.dylib`), a static library (`.a`), and
+as a single object file (`.o`).
+
+`> sudo make install` (install the library and header files in `/usr/local/lib`  and `/usr/local/include`)
+
+You can build the debug version which does many internal checks and
+maintains detailed statistics as:
+
+```
+> mkdir -p out/debug
+> cd out/debug
+> cmake -DCMAKE_BUILD_TYPE=Debug ../..
+> make
+```
+This will name the shared library as `libmimalloc-debug.so`.
+
+Finally, you can build a _secure_ version that uses guard pages, encrypted
+free lists, etc., as:
+```
+> mkdir -p out/secure
+> cd out/secure
+> cmake -DMI_SECURE=ON ../..
+> make
+```
+This will name the shared library as `libmimalloc-secure.so`.
+Use `ccmake`<sup>2</sup> instead of `cmake`
+to see and customize all the available build options.
+
+Notes:
+1. Install CMake: `sudo apt-get install cmake`
+2. Install CCMake: `sudo apt-get install cmake-curses-gui`
+
+
+## Single source
+
+You can also directly build the single `src/static.c` file as part of your project without
+needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path.
+
+
+# Using the library
+
+The preferred usage is including `<mimalloc.h>`, linking with
+the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example,
+```
+> gcc -o myprogram -lmimalloc myfile.c
+```
+
+mimalloc uses only safe OS calls (`mmap` and `VirtualAlloc`) and can co-exist
+with other allocators linked to the same program.
+If you use `cmake`, you can simply use:
+```
+find_package(mimalloc 1.4 REQUIRED)
+```
+in your `CMakeLists.txt` to find a locally installed mimalloc. Then use either:
+```
+target_link_libraries(myapp PUBLIC mimalloc)
+```
+to link with the shared (dynamic) library, or:
+```
+target_link_libraries(myapp PUBLIC mimalloc-static)
+```
+to link with the static library. See `test\CMakeLists.txt` for an example.
+
+For best performance in C++ programs, it is also recommended to override the
+global `new` and `delete` operators. For convience, mimalloc provides
+[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) which does this for you -- just include it in a single(!) source file in your project.
+In C++, mimalloc also provides the `mi_stl_allocator` struct which implements the `std::allocator`
+interface.
+
+You can pass environment variables to print verbose messages (`MIMALLOC_VERBOSE=1`)
+and statistics (`MIMALLOC_SHOW_STATS=1`) (in the debug version):
+```
+> env MIMALLOC_SHOW_STATS=1 ./cfrac 175451865205073170563711388363
+
+175451865205073170563711388363 = 374456281610909315237213 * 468551
+
+heap stats:     peak      total      freed       unit
+normal   2:    16.4 kb    17.5 mb    17.5 mb      16 b   ok
+normal   3:    16.3 kb    15.2 mb    15.2 mb      24 b   ok
+normal   4:      64 b      4.6 kb     4.6 kb      32 b   ok
+normal   5:      80 b    118.4 kb   118.4 kb      40 b   ok
+normal   6:      48 b       48 b       48 b       48 b   ok
+normal  17:     960 b      960 b      960 b      320 b   ok
+
+heap stats:     peak      total      freed       unit
+    normal:    33.9 kb    32.8 mb    32.8 mb       1 b   ok
+      huge:       0 b        0 b        0 b        1 b   ok
+     total:    33.9 kb    32.8 mb    32.8 mb       1 b   ok
+malloc requested:         32.8 mb
+
+ committed:    58.2 kb    58.2 kb    58.2 kb       1 b   ok
+  reserved:     2.0 mb     2.0 mb     2.0 mb       1 b   ok
+     reset:       0 b        0 b        0 b        1 b   ok
+  segments:       1          1          1
+-abandoned:       0
+     pages:       6          6          6
+-abandoned:       0
+     mmaps:       3
+ mmap fast:       0
+ mmap slow:       1
+   threads:       0
+   elapsed:     2.022s
+   process: user: 1.781s, system: 0.016s, faults: 756, reclaims: 0, rss: 2.7 mb
+```
+
+The above model of using the `mi_` prefixed API is not always possible
+though in existing programs that already use the standard malloc interface,
+and another option is to override the standard malloc interface
+completely and redirect all calls to the _mimalloc_ library instead .
+
+## Environment Options
+
+You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)),
+or via environment variables:
+
+- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
+- `MIMALLOC_VERBOSE=1`: show verbose messages.
+- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
+- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS
+   that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server)
+   programs. By setting it to `0` this will no longer be done which can improve performance for batch-like programs.
+   As an alternative, the `MIMALLOC_RESET_DELAY=`<msecs> can be set higher (100ms by default) to make the page
+   reset occur less frequently instead of turning it off completely.
+- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected
+   at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than
+   the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA
+   nodes (but this can happen in any case as NUMA local allocation is always a best effort but not guaranteed).
+- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
+   improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
+   to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
+   the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
+   can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
+   <!--
+   - `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
+   show in the working set even though usually just a small part is committed to physical memory. This is why it
+   turned off by default on Windows as it looks not good in the task manager. However, turning it on has no
+   real drawbacks and may improve performance by a little.
+   -->
+- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
+   startup and sometimes this can give a large (latency) performance improvement on big workloads.
+   Usually it is better to not use
+   `MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
+   contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at
+   startup only once).
+   Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge])).
+   With huge OS pages, it may be beneficial to set the setting
+   `MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB)
+   of a thread to not allocate in the huge OS pages; this prevents threads that are short lived
+   and allocate just a little to take up space in the huge OS page area (which cannot be reset).
+   The huge pages are usually allocated evenly among NUMA nodes.
+   We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the numa node (starting at 0) to allocate all
+   the huge pages at a specific numa node instead.
+
+Use caution when using `fork` in combination with either large or huge OS pages: on a fork, the OS uses copy-on-write
+for all pages in the original process including the huge OS pages. When any memory is now written in that area, the
+OS will copy the entire 1GiB huge page (or 2MiB large page) which can cause the memory usage to grow in large increments.
+
+[linux-huge]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/5/html/tuning_and_optimizing_red_hat_enterprise_linux_for_oracle_9i_and_10g_databases/sect-oracle_9i_and_10g_tuning_guide-large_memory_optimization_big_pages_and_huge_pages-configuring_huge_pages_in_red_hat_enterprise_linux_4_or_5
+[windows-huge]: https://docs.microsoft.com/en-us/sql/database-engine/configure-windows/enable-the-lock-pages-in-memory-option-windows?view=sql-server-2017
+
+## Secure Mode
+
+_mimalloc_ can be build in secure mode by using the `-DMI_SECURE=ON` flags in `cmake`. This build enables various mitigations
+to make mimalloc more robust against exploits. In particular:
+
+- All internal mimalloc pages are surrounded by guard pages and the heap metadata is behind a guard page as well (so a buffer overflow
+  exploit cannot reach into the metadata).
+- All free list pointers are
+  [encoded](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396)
+  with per-page keys which is used both to prevent overwrites with a known pointer, as well as to detect heap corruption.
+- Double free's are detected (and ignored).
+- The free lists are initialized in a random order and allocation randomly chooses between extension and reuse within a page to
+  mitigate against attacks that rely on a predicable allocation order. Similarly, the larger heap blocks allocated by mimalloc
+  from the OS are also address randomized.
+
+As always, evaluate with care as part of an overall security strategy as all of the above are mitigations but not guarantees.
+
+## Debug Mode
+
+When _mimalloc_ is built using debug mode, various checks are done at runtime to catch development errors.
+
+- Statistics are maintained in detail for each object size. They can be shown using `MIMALLOC_SHOW_STATS=1` at runtime.
+- All objects have padding at the end to detect (byte precise) heap block overflows.
+- Double free's, and freeing invalid heap pointers are detected.
+- Corrupted free-lists and some forms of use-after-free are detected.
+
+## Valgrind
+
+Generally, we recommend using the standard allocator with the amazing [Valgrind] tool (and
+also for other address sanitizers).
+However, it is possible to build mimalloc with Valgrind support. This has a small performance
+overhead but does allow detecting memory leaks and byte-precise buffer overflows directly on final
+executables. To build with valgrind support, use the `MI_VALGRIND=ON` cmake option:
+
+```
+> cmake ../.. -DMI_VALGRIND=ON
+```
+
+This can also be combined with secure mode or debug mode.
+You can then run your programs directly under valgrind:
+
+```
+> valgrind <myprogram>
+```
+
+If you rely on overriding `malloc`/`free` by mimalloc (instead of using the `mi_malloc`/`mi_free` API directly),
+you also need to tell `valgrind` to not intercept those calls itself, and use:
+
+```
+> MIMALLOC_SHOW_STATS=1 valgrind  --soname-synonyms=somalloc=*mimalloc* -- <myprogram>
+```
+
+By setting the `MIMALLOC_SHOW_STATS` environment variable you can check that mimalloc is indeed
+used and not the standard allocator. Even though the [Valgrind option][valgrind-soname]
+is called `--soname-synonyms`, this also
+works when overriding with a static library or object file. Unfortunately, it is not possible to
+dynamically override mimalloc using `LD_PRELOAD` together with `valgrind`.
+See also the `test/test-wrong.c` file to test with `valgrind`.
+
+Valgrind support is in its initial development -- please report any issues.
+
+[Valgrind]: https://valgrind.org/
+[valgrind-soname]: https://valgrind.org/docs/manual/manual-core.html#opt.soname-synonyms
+
+
+# Overriding Standard Malloc
+
+Overriding the standard `malloc` (and `new`) can be done either _dynamically_ or _statically_.
+
+## Dynamic override
+
+This is the recommended way to override the standard malloc interface.
+
+### Override on Linux, BSD
+
+On these ELF-based systems we preload the mimalloc shared
+library so all calls to the standard `malloc` interface are
+resolved to the _mimalloc_ library.
+```
+> env LD_PRELOAD=/usr/lib/libmimalloc.so myprogram
+```
+
+You can set extra environment variables to check that mimalloc is running,
+like:
+```
+> env MIMALLOC_VERBOSE=1 LD_PRELOAD=/usr/lib/libmimalloc.so myprogram
+```
+or run with the debug version to get detailed statistics:
+```
+> env MIMALLOC_SHOW_STATS=1 LD_PRELOAD=/usr/lib/libmimalloc-debug.so myprogram
+```
+
+### Override on MacOS
+
+On macOS we can also preload the mimalloc shared
+library so all calls to the standard `malloc` interface are
+resolved to the _mimalloc_ library.
+```
+> env DYLD_INSERT_LIBRARIES=/usr/lib/libmimalloc.dylib myprogram
+```
+
+Note that certain security restrictions may apply when doing this from
+the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
+
+
+### Override on Windows
+
+<span id="override_on_windows">Overriding on Windows</span> is robust and has the
+particular advantage to be able to redirect all malloc/free calls that go through
+the (dynamic) C runtime allocator, including those from other DLL's or libraries.
+
+The overriding on Windows requires that you link your program explicitly with
+the mimalloc DLL and use the C-runtime library as a DLL (using the `/MD` or `/MDd` switch).
+Also, the `mimalloc-redirect.dll` (or `mimalloc-redirect32.dll`) must be put
+in the same folder as the main `mimalloc-override.dll` at runtime (as it is a dependency).
+The redirection DLL ensures that all calls to the C runtime malloc API get redirected to
+mimalloc (in `mimalloc-override.dll`).
+
+To ensure the mimalloc DLL is loaded at run-time it is easiest to insert some
+call to the mimalloc API in the `main` function, like `mi_version()`
+(or use the `/INCLUDE:mi_version` switch on the linker). See the `mimalloc-override-test` project
+for an example on how to use this. For best performance on Windows with C++, it
+is also recommended to also override the `new`/`delete` operations (by including
+[`mimalloc-new-delete.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-new-delete.h) a single(!) source file in your project).
+
+The environment variable `MIMALLOC_DISABLE_REDIRECT=1` can be used to disable dynamic
+overriding at run-time. Use `MIMALLOC_VERBOSE=1` to check if mimalloc was successfully redirected.
+
+(Note: in principle, it is possible to even patch existing executables without any recompilation
+if they are linked with the dynamic C runtime (`ucrtbase.dll`) -- just put the `mimalloc-override.dll`
+into the import table (and put `mimalloc-redirect.dll` in the same folder)
+Such patching can be done for example with [CFF Explorer](https://ntcore.com/?page_id=388)).
+
+
+## Static override
+
+On Unix-like systems, you can also statically link with _mimalloc_ to override the standard
+malloc interface. The recommended way is to link the final program with the
+_mimalloc_ single object file (`mimalloc-override.o`). We use
+an object file instead of a library file as linkers give preference to
+that over archives to resolve symbols. To ensure that the standard
+malloc interface resolves to the _mimalloc_ library, link it as the first
+object file. For example:
+```
+> gcc -o myprogram mimalloc-override.o  myfile1.c ...
+```
+
+Another way to override statically that works on all platforms, is to
+link statically to mimalloc (as shown in the introduction) and include a
+header file in each source file that re-defines `malloc` etc. to `mi_malloc`.
+This is provided by [`mimalloc-override.h`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc-override.h). This only works reliably though if all sources are
+under your control or otherwise mixing of pointers from different heaps may occur!
+
+
+# Performance
+
+Last update: 2021-01-30
+
+We tested _mimalloc_ against many other top allocators over a wide
+range of benchmarks, ranging from various real world programs to
+synthetic benchmarks that see how the allocator behaves under more
+extreme circumstances. In our benchmark suite, _mimalloc_ outperforms other leading
+allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc), and has a similar memory footprint. A nice property is that it
+does consistently well over the wide range of benchmarks.
+
+General memory allocators are interesting as there exists no algorithm that is
+optimal -- for a given allocator one can usually construct a workload
+where it does not do so well. The goal is thus to find an allocation
+strategy that performs well over a wide range of benchmarks without
+suffering from (too much) underperformance in less common situations.
+
+As always, interpret these results with care since some benchmarks test synthetic
+or uncommon situations that may never apply to your workloads. For example, most
+allocators do not do well on `xmalloc-testN` but that includes even the best
+industrial allocators like _jemalloc_ and _tcmalloc_ that are used in some of
+the world's largest systems (like Chrome or FreeBSD).
+
+Also, the benchmarks here do not measure the behaviour on very large and long-running server workloads,
+or worst-case latencies of allocation. Much work has gone into `mimalloc` to work well on such
+workloads (for example, to reduce virtual memory fragmentation on long-running services)
+but such optimizations are not always reflected in the current benchmark suite.
+
+We show here only an overview -- for
+more specific details and further benchmarks we refer to the
+[technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action).
+The benchmark suite is automated and available separately
+as [mimalloc-bench](https://github.com/daanx/mimalloc-bench).
+
+
+## Benchmark Results on a 16-core AMD 5950x (Zen3)
+
+Testing on the 16-core AMD 5950x processor at 3.4Ghz (4.9Ghz boost), with
+with 32GiB memory at 3600Mhz, running	Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0.
+
+We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0),
+the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0).
+
+The other allocators are
+Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (`tc`, tag:gperftools-2.8.1) used in Chrome,
+Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (`je`, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
+the Intel thread building blocks [allocator](https://github.com/intel/tbb) (`tbb`, tag:v2020.3),
+[rpmalloc](https://github.com/mjansson/rpmalloc) (`rp`,tag:1.4.1) by Mattias Jansson,
+the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (git:d880f72) allocator by Emery Berger \[1],
+the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:67ff31a) allocator by
+Bobby Powers _et al_ \[8],
+and finally the default system allocator (`glibc`, 2.31) (based on _PtMalloc2_).
+
+<img width="90%" src="doc/bench-2021/bench-amd5950x-2021-01-30-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-amd5950x-2021-01-30-b.svg"/>
+
+Any benchmarks ending in `N` run on all 32 logical cores in parallel.
+Results are averaged over 10 runs and reported relative
+to mimalloc (where 1.2 means it took 1.2&times; longer to run).
+The legend also contains the _overall relative score_ between the
+allocators where 100 points is the maximum if an allocator is fastest on
+all benchmarks.
+
+The single threaded _cfrac_ benchmark by Dave Barrett is an implementation of
+continued fraction factorization which uses many small short-lived allocations.
+All allocators do well on such common usage, where _mimalloc_ is just a tad
+faster than _tcmalloc_ and
+_jemalloc_.
+
+The _leanN_ program is interesting as a large realistic and
+concurrent workload of the [Lean](https://github.com/leanprover/lean)
+theorem prover compiling its own standard library, and there is a 13%
+speedup over _tcmalloc_. This is
+quite significant: if Lean spends 20% of its time in the
+allocator that means that _mimalloc_ is 1.6&times; faster than _tcmalloc_
+here. (This is surprising as that is not measured in a pure
+allocation benchmark like _alloc-test_. We conjecture that we see this
+outsized improvement here because _mimalloc_ has better locality in
+the allocation which improves performance for the *other* computations
+in a program as well).
+
+The single threaded _redis_ benchmark again show that most allocators do well on such workloads.
+
+The _larsonN_ server benchmark by Larson and Krishnan \[2] allocates and frees between threads. They observed this
+behavior (which they call _bleeding_) in actual server applications, and the benchmark simulates this.
+Here, _mimalloc_ is quite a bit faster than _tcmalloc_ and _jemalloc_ probably due to the object migration between different threads.
+
+The _mstressN_ workload performs many allocations and re-allocations,
+and migrates objects between threads (as in _larsonN_). However, it also
+creates  and destroys the _N_ worker threads a few times keeping some objects
+alive beyond the life time of the allocating thread. We observed this
+behavior in many larger server applications.
+
+The [_rptestN_](https://github.com/mjansson/rpmalloc-benchmark) benchmark
+by Mattias Jansson is a allocator test originally designed
+for _rpmalloc_, and tries to simulate realistic allocation patterns over
+multiple threads. Here the differences between allocators become more apparent.
+
+The second benchmark set tests specific aspects of the allocators and
+shows even more extreme differences between them.
+
+The _alloc-test_, by
+[OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of
+allocations in various size classes. The test is scaled such that when an
+allocator performs almost identically on _alloc-test1_ as _alloc-testN_ it
+means that it scales linearly.
+
+The _sh6bench_ and _sh8bench_ benchmarks are
+developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap.
+In _sh6bench_ _mimalloc_ does much
+better than the others (more than 2.5&times; faster than _jemalloc_).
+We cannot explain this well but believe it is
+caused in part by the "reverse" free-ing pattern in _sh6bench_.
+The _sh8bench_ is a variation with object migration
+between threads; whereas _tcmalloc_ did well on _sh6bench_, the addition of object migration causes it to be 10&times; slower than before.
+
+The _xmalloc-testN_ benchmark by Lever and Boreham \[5] and Christian Eder, simulates an asymmetric workload where
+some threads only allocate, and others only free -- they observed this pattern in
+larger server applications. Here we see that
+the _mimalloc_ technique of having non-contended sharded thread free
+lists pays off as it outperforms others by a very large margin. Only _rpmalloc_, _tbb_, and _glibc_ also scale well on this benchmark.
+
+The _cache-scratch_ benchmark by Emery Berger \[1], and introduced with
+the Hoard allocator to test for _passive-false_ sharing of cache lines.
+With a single thread they all
+perform the same, but when running with multiple threads the potential allocator
+induced false sharing of the cache lines can cause large run-time differences.
+Crundal \[6] describes in detail why the false cache line sharing occurs in the _tcmalloc_ design, and also discusses how this
+can be avoided with some small implementation changes.
+Only the _tbb_, _rpmalloc_ and _mesh_ allocators also avoid the
+cache line sharing completely, while _Hoard_ and _glibc_ seem to mitigate
+the effects. Kukanov and Voss \[7] describe in detail
+how the design of _tbb_ avoids the false cache line sharing.
+
+
+## On a 36-core Intel Xeon
+
+For completeness, here are the results on a big Amazon
+[c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized) instance
+consisting of a 2&times;18-core Intel Xeon (Cascade Lake) at 3.4GHz (boost 3.5GHz)
+with 144GiB ECC memory, running	Ubuntu 20.04 with glibc 2.31, GCC 9.3.0, and
+Clang 10.0.0. This time, the mimalloc allocators (mi, xmi, and smi) were
+compiled with the Clang compiler instead of GCC.
+The results are similar to the AMD results but it is interesting to
+see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks.
+
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-b.svg"/>
+
+
+## Peak Working Set
+
+The following figure shows the peak working set (rss) of the allocators
+on the benchmarks (on the c5.18xlarge instance).
+
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-a.svg"/>
+<img width="90%" src="doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-b.svg"/>
+
+Note that the _xmalloc-testN_ memory usage should be disregarded as it
+allocates more the faster the program runs. Similarly, memory usage of
+_larsonN_, _mstressN_, _rptestN_ and _sh8bench_ can vary depending on scheduling and
+speed. Nevertheless, we hope to improve the memory usage on _mstressN_
+and _rptestN_ (just as _cfrac_, _larsonN_ and _sh8bench_ have a small working set which skews the results).
+
+<!--
+# Previous Benchmarks
+
+Todo: should we create a separate page for this?
+
+## Benchmark Results on 36-core Intel: 2020-01-20
+
+Testing on a big Amazon EC2 compute instance
+([c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized))
+consisting of a 72 processor Intel Xeon at 3GHz
+with 144GiB ECC memory, running	Ubuntu 18.04.1 with glibc 2.27 and GCC 7.4.0.
+The measured allocators are _mimalloc_ (xmi, tag:v1.4.0, page reset enabled)
+and its secure build as _smi_,
+Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (tc, tag:gperftools-2.7) used in Chrome,
+Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (je, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
+the Intel thread building blocks [allocator](https://github.com/intel/tbb) (tbb, tag:2020),
+[rpmalloc](https://github.com/mjansson/rpmalloc) (rp,tag:1.4.0) by Mattias Jansson,
+the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (tag:3.13) allocator by Emery Berger \[1],
+the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:51222e7) allocator by
+Bobby Powers _et al_ \[8],
+and finally the default system allocator (glibc, 2.27) (based on _PtMalloc2_).
+
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-b.svg"/>
+
+The following figure shows the peak working set (rss) of the allocators
+on the benchmarks (on the c5.18xlarge instance).
+
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-b.svg"/>
+
+
+## On 24-core AMD Epyc, 2020-01-16
+
+For completeness, here are the results on a
+[r5a.12xlarge](https://aws.amazon.com/ec2/instance-types/#Memory_Optimized) instance
+having a 48 processor AMD Epyc 7000 at 2.5GHz with 384GiB of memory.
+The results are similar to the Intel results but it is interesting to
+see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks.
+
+<img width="90%" src="doc/bench-2020/bench-r5a-12xlarge-2020-01-16-a.svg"/>
+<img width="90%" src="doc/bench-2020/bench-r5a-12xlarge-2020-01-16-b.svg"/>
+
+-->
+
+
+# References
+
+- \[1] Emery D. Berger, Kathryn S. McKinley, Robert D. Blumofe, and Paul R. Wilson.
+   _Hoard: A Scalable Memory Allocator for Multithreaded Applications_
+   the Ninth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-IX). Cambridge, MA, November 2000.
+   [pdf](http://www.cs.utexas.edu/users/mckinley/papers/asplos-2000.pdf)
+
+- \[2] P. Larson and M. Krishnan. _Memory allocation for long-running server applications_.
+  In ISMM, Vancouver, B.C., Canada, 1998. [pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.45.1947&rep=rep1&type=pdf)
+
+- \[3] D. Grunwald, B. Zorn, and R. Henderson.
+  _Improving the cache locality of memory allocation_. In R. Cartwright, editor,
+  Proceedings of the Conference on Programming Language Design and Implementation, pages 177–186, New York, NY, USA, June 1993. [pdf](http://citeseer.ist.psu.edu/viewdoc/download?doi=10.1.1.43.6621&rep=rep1&type=pdf)
+
+- \[4] J. Barnes and P. Hut. _A hierarchical O(n*log(n)) force-calculation algorithm_. Nature, 324:446-449, 1986.
+
+- \[5] C. Lever, and D. Boreham. _Malloc() Performance in a Multithreaded Linux Environment._
+  In USENIX Annual Technical Conference, Freenix Session. San Diego, CA. Jun. 2000.
+  Available at <https://github.com/kuszmaul/SuperMalloc/tree/master/tests>
+
+- \[6] Timothy Crundal. _Reducing Active-False Sharing in TCMalloc_. 2016. CS16S1 project at the Australian National University. [pdf](http://courses.cecs.anu.edu.au/courses/CSPROJECTS/16S1/Reports/Timothy_Crundal_Report.pdf)
+
+- \[7] Alexey Kukanov, and Michael J Voss.
+   _The Foundations for Scalable Multi-Core Software in Intel Threading Building Blocks._
+   Intel Technology Journal 11 (4). 2007
+
+- \[8] Bobby Powers, David Tench, Emery D. Berger, and Andrew McGregor.
+ _Mesh: Compacting Memory Management for C/C++_
+ In Proceedings of the 40th ACM SIGPLAN Conference on Programming Language Design and Implementation (PLDI'19), June 2019, pages 333-–346.
+
+<!--
+- \[9] Paul Liétar, Theodore Butler, Sylvan Clebsch, Sophia Drossopoulou, Juliana Franco, Matthew J Parkinson,
+  Alex Shamis, Christoph M Wintersteiger, and David Chisnall.
+  _Snmalloc: A Message Passing Allocator._
+  In Proceedings of the 2019 ACM SIGPLAN International Symposium on Memory Management, 122–135. ACM. 2019.
+-->
+
+# Contributing
+
+This project welcomes contributions and suggestions.  Most contributions require you to agree to a
+Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+the rights to use your contribution. For details, visit https://cla.microsoft.com.
+
+When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
+a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
+provided by the bot. You will only need to do this once across all repos using our CLA.
+
+
+# Older Release Notes
+
+* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
+  handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
+* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
+  support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support.
+* 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS,
+  build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics.
+* 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda,
+  stability fix for Windows 7, fix multiple mimalloc instances in one executable, fix `strnlen` overload,
+  fix aligned debug padding.
+* 2020-02-17, `v1.6.1`: stable release 1.6: minor updates (build with clang-cl, fix alignment issue for small objects).
+* 2020-02-09, `v1.6.0`: stable release 1.6: fixed potential memory leak, improved overriding
+  and thread local support on FreeBSD, NetBSD, DragonFly, and macOSX. New byte-precise
+  heap block overflow detection in debug mode (besides the double-free detection and free-list
+  corruption detection). Add `nodiscard` attribute to most allocation functions.
+  Enable `MIMALLOC_PAGE_RESET` by default. New reclamation strategy for abandoned heap pages
+  for better memory footprint.
+* 2020-02-09, `v1.5.0`: stable release 1.5: improved free performance, small bug fixes.
+* 2020-01-22, `v1.4.0`: stable release 1.4: improved performance for delayed OS page reset,
+more eager concurrent free, addition of STL allocator, fixed potential memory leak.
+* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger
+free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode.
+* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates.
+* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
+* 2019-10-07, `v1.1.0`: stable release 1.1.
+* 2019-09-01, `v1.0.8`: pre-release 8: more robust windows dynamic overriding, initial huge page support.
+* 2019-08-10, `v1.0.6`: pre-release 6: various performance improvements.
diff --git a/compat/mimalloc/segment-cache.c b/compat/mimalloc/segment-cache.c
new file mode 100644
index 0000000000..7a244c3ff2
--- /dev/null
+++ b/compat/mimalloc/segment-cache.c
@@ -0,0 +1,409 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+  Implements a cache of segments to avoid expensive OS calls and to reuse
+  the commit_mask to optimize the commit/decommit calls.
+  The full memory map of all segments is also implemented here.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include "bitmap.h"  // atomic bitmap
+
+//#define MI_CACHE_DISABLE 1    // define to completely disable the segment cache
+
+#define MI_CACHE_FIELDS     (16)
+#define MI_CACHE_MAX        (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS)       // 1024 on 64-bit
+
+#define BITS_SET()          MI_ATOMIC_VAR_INIT(UINTPTR_MAX)
+#define MI_CACHE_BITS_SET   MI_INIT16(BITS_SET)                          // note: update if MI_CACHE_FIELDS changes
+
+typedef struct mi_cache_slot_s {
+  void*               p;
+  size_t              memid;
+  bool                is_pinned;
+  mi_commit_mask_t    commit_mask;
+  mi_commit_mask_t    decommit_mask;
+  _Atomic(mi_msecs_t) expire;
+} mi_cache_slot_t;
+
+static mi_decl_cache_align mi_cache_slot_t cache[MI_CACHE_MAX];    // = 0
+
+static mi_decl_cache_align mi_bitmap_field_t cache_available[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };        // zero bit = available!
+static mi_decl_cache_align mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_CACHE_BITS_SET };
+static mi_decl_cache_align mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS];   // zero bit = free
+
+static bool mi_cdecl mi_segment_cache_is_suitable(mi_bitmap_index_t bitidx, void* arg) {
+  mi_arena_id_t req_arena_id = *((mi_arena_id_t*)arg);
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  return _mi_arena_memid_is_suitable(slot->memid, req_arena_id);
+}
+
+mi_decl_noinline static void* mi_segment_cache_pop_ex(
+			      bool all_suitable,
+			      size_t size, mi_commit_mask_t* commit_mask,
+			      mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero,
+			      mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+  return NULL;
+#else
+
+  // only segment blocks
+  if (size != MI_SEGMENT_SIZE) return NULL;
+
+  // numa node determines start field
+  const int numa_node = _mi_os_numa_node(tld);
+  size_t start_field = 0;
+  if (numa_node > 0) {
+    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+  }
+
+  // find an available slot
+  mi_bitmap_index_t bitidx = 0;
+  bool claimed = false;
+  mi_arena_id_t req_arena_id = _req_arena_id;
+  mi_bitmap_pred_fun_t pred_fun = (all_suitable ? NULL : &mi_segment_cache_is_suitable);  // cannot pass NULL as the arena may be exclusive itself; todo: do not put exclusive arenas in the cache?
+
+  if (*large) {  // large allowed?
+    claimed = _mi_bitmap_try_find_from_claim_pred(cache_available_large, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
+    if (claimed) *large = true;
+  }
+  if (!claimed) {
+    claimed = _mi_bitmap_try_find_from_claim_pred (cache_available, MI_CACHE_FIELDS, start_field, 1, pred_fun, &req_arena_id, &bitidx);
+    if (claimed) *large = false;
+  }
+
+  if (!claimed) return NULL;
+
+  // found a slot
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  void* p = slot->p;
+  *memid = slot->memid;
+  *is_pinned = slot->is_pinned;
+  *is_zero = false;
+  *commit_mask = slot->commit_mask;
+  *decommit_mask = slot->decommit_mask;
+  slot->p = NULL;
+  mi_atomic_storei64_release(&slot->expire,(mi_msecs_t)0);
+
+  // mark the slot as free again
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
+  _mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
+  return p;
+#endif
+}
+
+
+mi_decl_noinline void* _mi_segment_cache_pop(size_t size, mi_commit_mask_t* commit_mask, mi_commit_mask_t* decommit_mask, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t _req_arena_id, size_t* memid, mi_os_tld_t* tld)
+{
+  return mi_segment_cache_pop_ex(false, size, commit_mask, decommit_mask, large, is_pinned, is_zero, _req_arena_id, memid, tld);
+}
+
+static mi_decl_noinline void mi_commit_mask_decommit(mi_commit_mask_t* cmask, void* p, size_t total, mi_stats_t* stats)
+{
+  if (mi_commit_mask_is_empty(cmask)) {
+    // nothing
+  }
+  else if (mi_commit_mask_is_full(cmask)) {
+    _mi_os_decommit(p, total, stats);
+  }
+  else {
+    // todo: one call to decommit the whole at once?
+    mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+    size_t part = total/MI_COMMIT_MASK_BITS;
+    size_t idx;
+    size_t count;
+    mi_commit_mask_foreach(cmask, idx, count) {
+      void*  start = (uint8_t*)p + (idx*part);
+      size_t size = count*part;
+      _mi_os_decommit(start, size, stats);
+    }
+    mi_commit_mask_foreach_end()
+  }
+  mi_commit_mask_create_empty(cmask);
+}
+
+#define MI_MAX_PURGE_PER_PUSH  (4)
+
+static mi_decl_noinline void mi_segment_cache_purge(bool visit_all, bool force, mi_os_tld_t* tld)
+{
+  MI_UNUSED(tld);
+  if (!mi_option_is_enabled(mi_option_allow_decommit)) return;
+  mi_msecs_t now = _mi_clock_now();
+  size_t purged = 0;
+  const size_t max_visits = (visit_all ? MI_CACHE_MAX /* visit all */ : MI_CACHE_FIELDS /* probe at most N (=16) slots */);
+  size_t idx              = (visit_all ? 0 : _mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX /* random start */ );
+  for (size_t visited = 0; visited < max_visits; visited++,idx++) {  // visit N slots
+    if (idx >= MI_CACHE_MAX) idx = 0; // wrap
+    mi_cache_slot_t* slot = &cache[idx];
+    mi_msecs_t expire = mi_atomic_loadi64_relaxed(&slot->expire);
+    if (expire != 0 && (force || now >= expire)) {  // racy read
+      // seems expired, first claim it from available
+      purged++;
+      mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
+      if (_mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
+	// was available, we claimed it
+	expire = mi_atomic_loadi64_acquire(&slot->expire);
+	if (expire != 0 && (force || now >= expire)) {  // safe read
+	  // still expired, decommit it
+	  mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+	  mi_assert_internal(!mi_commit_mask_is_empty(&slot->commit_mask) && _mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+	  _mi_abandoned_await_readers();  // wait until safe to decommit
+	  // decommit committed parts
+	  // TODO: instead of decommit, we could also free to the OS?
+	  mi_commit_mask_decommit(&slot->commit_mask, slot->p, MI_SEGMENT_SIZE, tld->stats);
+	  mi_commit_mask_create_empty(&slot->decommit_mask);
+	}
+	_mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx); // make it available again for a pop
+      }
+      if (!visit_all && purged > MI_MAX_PURGE_PER_PUSH) break;  // bound to no more than N purge tries per push
+    }
+  }
+}
+
+void _mi_segment_cache_collect(bool force, mi_os_tld_t* tld) {
+  if (force) {
+    // called on `mi_collect(true)` but not on thread termination
+    _mi_segment_cache_free_all(tld);
+  }
+  else {
+    mi_segment_cache_purge(true /* visit all */, false /* don't force unexpired */, tld);
+  }
+}
+
+void _mi_segment_cache_free_all(mi_os_tld_t* tld) {
+  mi_commit_mask_t commit_mask;
+  mi_commit_mask_t decommit_mask;
+  bool is_pinned;
+  bool is_zero;
+  size_t memid;
+  const size_t size = MI_SEGMENT_SIZE;
+  // iterate twice: first large pages, then regular memory
+  for (int i = 0; i < 2; i++) {
+    void* p;
+    do {
+      // keep popping and freeing the memory
+      bool large = (i == 0);
+      p = mi_segment_cache_pop_ex(true /* all */, size, &commit_mask, &decommit_mask,
+				  &large, &is_pinned, &is_zero, _mi_arena_id_none(), &memid, tld);
+      if (p != NULL) {
+	size_t csize = _mi_commit_mask_committed_size(&commit_mask, size);
+	if (csize > 0 && !is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
+	_mi_arena_free(p, size, MI_SEGMENT_ALIGN, 0, memid, is_pinned /* pretend not committed to not double count decommits */, tld->stats);
+      }
+    } while (p != NULL);
+  }
+}
+
+mi_decl_noinline bool _mi_segment_cache_push(void* start, size_t size, size_t memid, const mi_commit_mask_t* commit_mask, const mi_commit_mask_t* decommit_mask, bool is_large, bool is_pinned, mi_os_tld_t* tld)
+{
+#ifdef MI_CACHE_DISABLE
+  return false;
+#else
+
+  // only for normal segment blocks
+  if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;
+
+  // numa node determines start field
+  int numa_node = _mi_os_numa_node(NULL);
+  size_t start_field = 0;
+  if (numa_node > 0) {
+    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
+    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+  }
+
+  // purge expired entries
+  mi_segment_cache_purge(false /* limit purges to a constant N */, false /* don't force unexpired */, tld);
+
+  // find an available slot
+  mi_bitmap_index_t bitidx;
+  bool claimed = _mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
+  if (!claimed) return false;
+
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
+  mi_assert_internal(_mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+#if MI_DEBUG>1
+  if (is_pinned || is_large) {
+    mi_assert_internal(mi_commit_mask_is_full(commit_mask));
+  }
+#endif
+
+  // set the slot
+  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
+  slot->p = start;
+  slot->memid = memid;
+  slot->is_pinned = is_pinned;
+  mi_atomic_storei64_relaxed(&slot->expire,(mi_msecs_t)0);
+  slot->commit_mask = *commit_mask;
+  slot->decommit_mask = *decommit_mask;
+  if (!mi_commit_mask_is_empty(commit_mask) && !is_large && !is_pinned && mi_option_is_enabled(mi_option_allow_decommit)) {
+    long delay = mi_option_get(mi_option_segment_decommit_delay);
+    if (delay == 0) {
+      _mi_abandoned_await_readers(); // wait until safe to decommit
+      mi_commit_mask_decommit(&slot->commit_mask, start, MI_SEGMENT_SIZE, tld->stats);
+      mi_commit_mask_create_empty(&slot->decommit_mask);
+    }
+    else {
+      mi_atomic_storei64_release(&slot->expire, _mi_clock_now() + delay);
+    }
+  }
+
+  // make it available
+  _mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
+  return true;
+#endif
+}
+
+
+/* -----------------------------------------------------------
+  The following functions are to reliably find the segment or
+  block that encompasses any pointer p (or NULL if it is not
+  in any of our segments).
+  We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB)
+  set to 1 if it contains the segment meta data.
+----------------------------------------------------------- */
+
+
+#if (MI_INTPTR_SIZE==8)
+#define MI_MAX_ADDRESS    ((size_t)20 << 40)  // 20TB
+#else
+#define MI_MAX_ADDRESS    ((size_t)2 << 30)   // 2Gb
+#endif
+
+#define MI_SEGMENT_MAP_BITS  (MI_MAX_ADDRESS / MI_SEGMENT_SIZE)
+#define MI_SEGMENT_MAP_SIZE  (MI_SEGMENT_MAP_BITS / 8)
+#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
+
+static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1];  // 2KiB per TB with 64MiB segments
+
+static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
+  mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE?
+  if ((uintptr_t)segment >= MI_MAX_ADDRESS) {
+    *bitidx = 0;
+    return MI_SEGMENT_MAP_WSIZE;
+  }
+  else {
+    const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE;
+    *bitidx = segindex % MI_INTPTR_BITS;
+    const size_t mapindex = segindex / MI_INTPTR_BITS;
+    mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE);
+    return mapindex;
+  }
+}
+
+void _mi_segment_map_allocated_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index==MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask | ((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+void _mi_segment_map_freed_at(const mi_segment_t* segment) {
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE);
+  if (index == MI_SEGMENT_MAP_WSIZE) return;
+  uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  uintptr_t newmask;
+  do {
+    newmask = (mask & ~((uintptr_t)1 << bitidx));
+  } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
+}
+
+// Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
+static mi_segment_t* _mi_segment_of(const void* p) {
+  if (p == NULL) return NULL;
+  mi_segment_t* segment = _mi_ptr_segment(p);
+  mi_assert_internal(segment != NULL);
+  size_t bitidx;
+  size_t index = mi_segment_map_index_of(segment, &bitidx);
+  // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge
+  const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]);
+  if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) {
+    return segment; // yes, allocated by us
+  }
+  if (index==MI_SEGMENT_MAP_WSIZE) return NULL;
+
+  // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers?
+
+  // search downwards for the first segment in case it is an interior pointer
+  // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough
+  // valid huge objects
+  // note: we could maintain a lowest index to speed up the path for invalid pointers?
+  size_t lobitidx;
+  size_t loindex;
+  uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1);
+  if (lobits != 0) {
+    loindex = index;
+    lobitidx = mi_bsr(lobits);    // lobits != 0
+  }
+  else if (index == 0) {
+    return NULL;
+  }
+  else {
+    mi_assert_internal(index > 0);
+    uintptr_t lomask = mask;
+    loindex = index;
+    do {
+      loindex--;
+      lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]);
+    } while (lomask != 0 && loindex > 0);
+    if (lomask == 0) return NULL;
+    lobitidx = mi_bsr(lomask);    // lomask != 0
+  }
+  mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE);
+  // take difference as the addresses could be larger than the MAX_ADDRESS space.
+  size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE;
+  segment = (mi_segment_t*)((uint8_t*)segment - diff);
+
+  if (segment == NULL) return NULL;
+  mi_assert_internal((void*)segment < p);
+  bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(cookie_ok);
+  if mi_unlikely(!cookie_ok) return NULL;
+  if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range
+  mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment));
+  return segment;
+}
+
+// Is this a valid pointer in our heap?
+static bool  mi_is_valid_pointer(const void* p) {
+  return (_mi_segment_of(p) != NULL);
+}
+
+mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
+  return mi_is_valid_pointer(p);
+}
+
+/*
+// Return the full segment range belonging to a pointer
+static void* mi_segment_range_of(const void* p, size_t* size) {
+  mi_segment_t* segment = _mi_segment_of(p);
+  if (segment == NULL) {
+    if (size != NULL) *size = 0;
+    return NULL;
+  }
+  else {
+    if (size != NULL) *size = segment->segment_size;
+    return segment;
+  }
+  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+  mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 0 : _mi_os_page_size())) >= block_size);
+  mi_reset_delayed(tld);
+  mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld));
+  return page;
+}
+*/
diff --git a/compat/mimalloc/segment.c b/compat/mimalloc/segment.c
new file mode 100644
index 0000000000..85158ece49
--- /dev/null
+++ b/compat/mimalloc/segment.c
@@ -0,0 +1,1621 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+#include <stdio.h>
+
+#define MI_PAGE_HUGE_ALIGN  (256*1024)
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats);
+
+
+// -------------------------------------------------------------------
+// commit mask
+// -------------------------------------------------------------------
+
+static bool mi_commit_mask_all_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if ((commit->mask[i] & cm->mask[i]) != cm->mask[i]) return false;
+  }
+  return true;
+}
+
+static bool mi_commit_mask_any_set(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    if ((commit->mask[i] & cm->mask[i]) != 0) return true;
+  }
+  return false;
+}
+
+static void mi_commit_mask_create_intersect(const mi_commit_mask_t* commit, const mi_commit_mask_t* cm, mi_commit_mask_t* res) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] = (commit->mask[i] & cm->mask[i]);
+  }
+}
+
+static void mi_commit_mask_clear(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] &= ~(cm->mask[i]);
+  }
+}
+
+static void mi_commit_mask_set(mi_commit_mask_t* res, const mi_commit_mask_t* cm) {
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    res->mask[i] |= cm->mask[i];
+  }
+}
+
+static void mi_commit_mask_create(size_t bitidx, size_t bitcount, mi_commit_mask_t* cm) {
+  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+  if (bitcount == MI_COMMIT_MASK_BITS) {
+    mi_assert_internal(bitidx==0);
+    mi_commit_mask_create_full(cm);
+  }
+  else if (bitcount == 0) {
+    mi_commit_mask_create_empty(cm);
+  }
+  else {
+    mi_commit_mask_create_empty(cm);
+    size_t i = bitidx / MI_COMMIT_MASK_FIELD_BITS;
+    size_t ofs = bitidx % MI_COMMIT_MASK_FIELD_BITS;
+    while (bitcount > 0) {
+      mi_assert_internal(i < MI_COMMIT_MASK_FIELD_COUNT);
+      size_t avail = MI_COMMIT_MASK_FIELD_BITS - ofs;
+      size_t count = (bitcount > avail ? avail : bitcount);
+      size_t mask = (count >= MI_COMMIT_MASK_FIELD_BITS ? ~((size_t)0) : (((size_t)1 << count) - 1) << ofs);
+      cm->mask[i] = mask;
+      bitcount -= count;
+      ofs = 0;
+      i++;
+    }
+  }
+}
+
+size_t _mi_commit_mask_committed_size(const mi_commit_mask_t* cm, size_t total) {
+  mi_assert_internal((total%MI_COMMIT_MASK_BITS)==0);
+  size_t count = 0;
+  for (size_t i = 0; i < MI_COMMIT_MASK_FIELD_COUNT; i++) {
+    size_t mask = cm->mask[i];
+    if (~mask == 0) {
+      count += MI_COMMIT_MASK_FIELD_BITS;
+    }
+    else {
+      for (; mask != 0; mask >>= 1) {  // todo: use popcount
+	if ((mask&1)!=0) count++;
+      }
+    }
+  }
+  // we use total since for huge segments each commit bit may represent a larger size
+  return ((total / MI_COMMIT_MASK_BITS) * count);
+}
+
+
+size_t _mi_commit_mask_next_run(const mi_commit_mask_t* cm, size_t* idx) {
+  size_t i = (*idx) / MI_COMMIT_MASK_FIELD_BITS;
+  size_t ofs = (*idx) % MI_COMMIT_MASK_FIELD_BITS;
+  size_t mask = 0;
+  // find first ones
+  while (i < MI_COMMIT_MASK_FIELD_COUNT) {
+    mask = cm->mask[i];
+    mask >>= ofs;
+    if (mask != 0) {
+      while ((mask&1) == 0) {
+	mask >>= 1;
+	ofs++;
+      }
+      break;
+    }
+    i++;
+    ofs = 0;
+  }
+  if (i >= MI_COMMIT_MASK_FIELD_COUNT) {
+    // not found
+    *idx = MI_COMMIT_MASK_BITS;
+    return 0;
+  }
+  else {
+    // found, count ones
+    size_t count = 0;
+    *idx = (i*MI_COMMIT_MASK_FIELD_BITS) + ofs;
+    do {
+      mi_assert_internal(ofs < MI_COMMIT_MASK_FIELD_BITS && (mask&1) == 1);
+      do {
+	count++;
+	mask >>= 1;
+      } while ((mask&1) == 1);
+      if ((((*idx + count) % MI_COMMIT_MASK_FIELD_BITS) == 0)) {
+	i++;
+	if (i >= MI_COMMIT_MASK_FIELD_COUNT) break;
+	mask = cm->mask[i];
+	ofs = 0;
+      }
+    } while ((mask&1) == 1);
+    mi_assert_internal(count > 0);
+    return count;
+  }
+}
+
+
+/* --------------------------------------------------------------------------------
+  Segment allocation
+
+  If a  thread ends, it "abandons" pages with used blocks
+  and there is an abandoned segment list whose segments can
+  be reclaimed by still running threads, much like work-stealing.
+-------------------------------------------------------------------------------- */
+
+
+/* -----------------------------------------------------------
+   Slices
+----------------------------------------------------------- */
+
+
+static const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment) {
+  return &segment->slices[segment->slice_entries];
+}
+
+static uint8_t* mi_slice_start(const mi_slice_t* slice) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  mi_assert_internal(slice >= segment->slices && slice < mi_segment_slices_end(segment));
+  return ((uint8_t*)segment + ((slice - segment->slices)*MI_SEGMENT_SLICE_SIZE));
+}
+
+
+/* -----------------------------------------------------------
+   Bins
+----------------------------------------------------------- */
+// Use bit scan forward to quickly find the first zero bit if it is available
+
+static inline size_t mi_slice_bin8(size_t slice_count) {
+  if (slice_count<=1) return slice_count;
+  mi_assert_internal(slice_count <= MI_SLICES_PER_SEGMENT);
+  slice_count--;
+  size_t s = mi_bsr(slice_count);  // slice_count > 1
+  if (s <= 2) return slice_count + 1;
+  size_t bin = ((s << 2) | ((slice_count >> (s - 2))&0x03)) - 4;
+  return bin;
+}
+
+static inline size_t mi_slice_bin(size_t slice_count) {
+  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_SEGMENT_SIZE);
+  mi_assert_internal(mi_slice_bin8(MI_SLICES_PER_SEGMENT) <= MI_SEGMENT_BIN_MAX);
+  size_t bin = mi_slice_bin8(slice_count);
+  mi_assert_internal(bin <= MI_SEGMENT_BIN_MAX);
+  return bin;
+}
+
+static inline size_t mi_slice_index(const mi_slice_t* slice) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  ptrdiff_t index = slice - segment->slices;
+  mi_assert_internal(index >= 0 && index < (ptrdiff_t)segment->slice_entries);
+  return index;
+}
+
+
+/* -----------------------------------------------------------
+   Slice span queues
+----------------------------------------------------------- */
+
+static void mi_span_queue_push(mi_span_queue_t* sq, mi_slice_t* slice) {
+  // todo: or push to the end?
+  mi_assert_internal(slice->prev == NULL && slice->next==NULL);
+  slice->prev = NULL; // paranoia
+  slice->next = sq->first;
+  sq->first = slice;
+  if (slice->next != NULL) slice->next->prev = slice;
+		     else sq->last = slice;
+  slice->xblock_size = 0; // free
+}
+
+static mi_span_queue_t* mi_span_queue_for(size_t slice_count, mi_segments_tld_t* tld) {
+  size_t bin = mi_slice_bin(slice_count);
+  mi_span_queue_t* sq = &tld->spans[bin];
+  mi_assert_internal(sq->slice_count >= slice_count);
+  return sq;
+}
+
+static void mi_span_queue_delete(mi_span_queue_t* sq, mi_slice_t* slice) {
+  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+  // should work too if the queue does not contain slice (which can happen during reclaim)
+  if (slice->prev != NULL) slice->prev->next = slice->next;
+  if (slice == sq->first) sq->first = slice->next;
+  if (slice->next != NULL) slice->next->prev = slice->prev;
+  if (slice == sq->last) sq->last = slice->prev;
+  slice->prev = NULL;
+  slice->next = NULL;
+  slice->xblock_size = 1; // no more free
+}
+
+
+/* -----------------------------------------------------------
+ Invariant checking
+----------------------------------------------------------- */
+
+static bool mi_slice_is_used(const mi_slice_t* slice) {
+  return (slice->xblock_size > 0);
+}
+
+
+#if (MI_DEBUG>=3)
+static bool mi_span_queue_contains(mi_span_queue_t* sq, mi_slice_t* slice) {
+  for (mi_slice_t* s = sq->first; s != NULL; s = s->next) {
+    if (s==slice) return true;
+  }
+  return false;
+}
+
+static bool mi_segment_is_valid(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_assert_internal(segment != NULL);
+  mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie);
+  mi_assert_internal(segment->abandoned <= segment->used);
+  mi_assert_internal(segment->thread_id == 0 || segment->thread_id == _mi_thread_id());
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask)); // can only decommit committed blocks
+  //mi_assert_internal(segment->segment_info_size % MI_SEGMENT_SLICE_SIZE == 0);
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  size_t used_count = 0;
+  mi_span_queue_t* sq;
+  while(slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    size_t index = mi_slice_index(slice);
+    size_t maxindex = (index + slice->slice_count >= segment->slice_entries ? segment->slice_entries : index + slice->slice_count) - 1;
+    if (mi_slice_is_used(slice)) { // a page in use, we need at least MAX_SLICE_OFFSET valid back offsets
+      used_count++;
+      for (size_t i = 0; i <= MI_MAX_SLICE_OFFSET && index + i <= maxindex; i++) {
+	mi_assert_internal(segment->slices[index + i].slice_offset == i*sizeof(mi_slice_t));
+	mi_assert_internal(i==0 || segment->slices[index + i].slice_count == 0);
+	mi_assert_internal(i==0 || segment->slices[index + i].xblock_size == 1);
+      }
+      // and the last entry as well (for coalescing)
+      const mi_slice_t* last = slice + slice->slice_count - 1;
+      if (last > slice && last < mi_segment_slices_end(segment)) {
+	mi_assert_internal(last->slice_offset == (slice->slice_count-1)*sizeof(mi_slice_t));
+	mi_assert_internal(last->slice_count == 0);
+	mi_assert_internal(last->xblock_size == 1);
+      }
+    }
+    else {  // free range of slices; only last slice needs a valid back offset
+      mi_slice_t* last = &segment->slices[maxindex];
+      if (segment->kind != MI_SEGMENT_HUGE || slice->slice_count <= (segment->slice_entries - segment->segment_info_slices)) {
+	mi_assert_internal((uint8_t*)slice == (uint8_t*)last - last->slice_offset);
+      }
+      mi_assert_internal(slice == last || last->slice_count == 0 );
+      mi_assert_internal(last->xblock_size == 0 || (segment->kind==MI_SEGMENT_HUGE && last->xblock_size==1));
+      if (segment->kind != MI_SEGMENT_HUGE && segment->thread_id != 0) { // segment is not huge or abandoned
+	sq = mi_span_queue_for(slice->slice_count,tld);
+	mi_assert_internal(mi_span_queue_contains(sq,slice));
+      }
+    }
+    slice = &segment->slices[maxindex+1];
+  }
+  mi_assert_internal(slice == end);
+  mi_assert_internal(used_count == segment->used + 1);
+  return true;
+}
+#endif
+
+/* -----------------------------------------------------------
+ Segment size calculations
+----------------------------------------------------------- */
+
+static size_t mi_segment_info_size(mi_segment_t* segment) {
+  return segment->segment_info_slices * MI_SEGMENT_SLICE_SIZE;
+}
+
+static uint8_t* _mi_segment_page_start_from_slice(const mi_segment_t* segment, const mi_slice_t* slice, size_t xblock_size, size_t* page_size)
+{
+  ptrdiff_t idx = slice - segment->slices;
+  size_t psize = (size_t)slice->slice_count * MI_SEGMENT_SLICE_SIZE;
+  // make the start not OS page aligned for smaller blocks to avoid page/cache effects
+  size_t start_offset = (xblock_size >= MI_INTPTR_SIZE && xblock_size <= 1024 ? 3*MI_MAX_ALIGN_GUARANTEE : 0);
+  if (page_size != NULL) { *page_size = psize - start_offset; }
+  return (uint8_t*)segment + ((idx*MI_SEGMENT_SLICE_SIZE) + start_offset);
+}
+
+// Start of the page available memory; can be used on uninitialized pages
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size)
+{
+  const mi_slice_t* slice = mi_page_to_slice((mi_page_t*)page);
+  uint8_t* p = _mi_segment_page_start_from_slice(segment, slice, page->xblock_size, page_size);
+  mi_assert_internal(page->xblock_size > 0 || _mi_ptr_page(p) == page);
+  mi_assert_internal(_mi_ptr_segment(p) == segment);
+  return p;
+}
+
+
+static size_t mi_segment_calculate_slices(size_t required, size_t* pre_size, size_t* info_slices) {
+  size_t page_size = _mi_os_page_size();
+  size_t isize     = _mi_align_up(sizeof(mi_segment_t), page_size);
+  size_t guardsize = 0;
+
+  if (MI_SECURE>0) {
+    // in secure mode, we set up a protected page in between the segment info
+    // and the page data (and one at the end of the segment)
+    guardsize = page_size;
+    if (required > 0) {
+      required = _mi_align_up(required, MI_SEGMENT_SLICE_SIZE) + page_size;
+    }
+  }
+
+  if (pre_size != NULL) *pre_size = isize;
+  isize = _mi_align_up(isize + guardsize, MI_SEGMENT_SLICE_SIZE);
+  if (info_slices != NULL) *info_slices = isize / MI_SEGMENT_SLICE_SIZE;
+  size_t segment_size = (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + guardsize, MI_SEGMENT_SLICE_SIZE) );
+  mi_assert_internal(segment_size % MI_SEGMENT_SLICE_SIZE == 0);
+  return (segment_size / MI_SEGMENT_SLICE_SIZE);
+}
+
+
+/* ----------------------------------------------------------------------------
+Segment caches
+We keep a small segment cache per thread to increase local
+reuse and avoid setting/clearing guard pages in secure mode.
+------------------------------------------------------------------------------- */
+
+static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) {
+  if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1);
+		  else _mi_stat_decrease(&tld->stats->segments,1);
+  tld->count += (segment_size >= 0 ? 1 : -1);
+  if (tld->count > tld->peak_count) tld->peak_count = tld->count;
+  tld->current_size += segment_size;
+  if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size;
+}
+
+static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  segment->thread_id = 0;
+  _mi_segment_map_freed_at(segment);
+  mi_segments_track_size(-((long)mi_segment_size(segment)),tld);
+  if (MI_SECURE>0) {
+    // _mi_os_unprotect(segment, mi_segment_size(segment)); // ensure no more guard pages are set
+    // unprotect the guard pages; we cannot just unprotect the whole segment size as part may be decommitted
+    size_t os_pagesize = _mi_os_page_size();
+    _mi_os_unprotect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+    _mi_os_unprotect(end, os_pagesize);
+  }
+
+  // purge delayed decommits now? (no, leave it to the cache)
+  // mi_segment_delayed_decommit(segment,true,tld->stats);
+
+  // _mi_os_free(segment, mi_segment_size(segment), /*segment->memid,*/ tld->stats);
+  const size_t size = mi_segment_size(segment);
+  if (size != MI_SEGMENT_SIZE || segment->mem_align_offset != 0 || segment->kind == MI_SEGMENT_HUGE || // only push regular segments on the cache
+       !_mi_segment_cache_push(segment, size, segment->memid, &segment->commit_mask, &segment->decommit_mask, segment->mem_is_large, segment->mem_is_pinned, tld->os))
+  {
+    const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);
+    if (csize > 0 && !segment->mem_is_pinned) _mi_stat_decrease(&_mi_stats_main.committed, csize);
+    _mi_abandoned_await_readers();  // wait until safe to free
+    _mi_arena_free(segment, mi_segment_size(segment), segment->mem_alignment, segment->mem_align_offset, segment->memid, segment->mem_is_pinned /* pretend not committed to not double count decommits */, tld->stats);
+  }
+}
+
+// called by threads that are terminating
+void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
+  MI_UNUSED(tld);
+  // nothing to do
+}
+
+
+/* -----------------------------------------------------------
+   Commit/Decommit ranges
+----------------------------------------------------------- */
+
+static void mi_segment_commit_mask(mi_segment_t* segment, bool conservative, uint8_t* p, size_t size, uint8_t** start_p, size_t* full_size, mi_commit_mask_t* cm) {
+  mi_assert_internal(_mi_ptr_segment(p + 1) == segment);
+  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+  mi_commit_mask_create_empty(cm);
+  if (size == 0 || size > MI_SEGMENT_SIZE || segment->kind == MI_SEGMENT_HUGE) return;
+  const size_t segstart = mi_segment_info_size(segment);
+  const size_t segsize = mi_segment_size(segment);
+  if (p >= (uint8_t*)segment + segsize) return;
+
+  size_t pstart = (p - (uint8_t*)segment);
+  mi_assert_internal(pstart + size <= segsize);
+
+  size_t start;
+  size_t end;
+  if (conservative) {
+    // decommit conservative
+    start = _mi_align_up(pstart, MI_COMMIT_SIZE);
+    end   = _mi_align_down(pstart + size, MI_COMMIT_SIZE);
+    mi_assert_internal(start >= segstart);
+    mi_assert_internal(end <= segsize);
+  }
+  else {
+    // commit liberal
+    start = _mi_align_down(pstart, MI_MINIMAL_COMMIT_SIZE);
+    end   = _mi_align_up(pstart + size, MI_MINIMAL_COMMIT_SIZE);
+  }
+  if (pstart >= segstart && start < segstart) {  // note: the mask is also calculated for an initial commit of the info area
+    start = segstart;
+  }
+  if (end > segsize) {
+    end = segsize;
+  }
+
+  mi_assert_internal(start <= pstart && (pstart + size) <= end);
+  mi_assert_internal(start % MI_COMMIT_SIZE==0 && end % MI_COMMIT_SIZE == 0);
+  *start_p   = (uint8_t*)segment + start;
+  *full_size = (end > start ? end - start : 0);
+  if (*full_size == 0) return;
+
+  size_t bitidx = start / MI_COMMIT_SIZE;
+  mi_assert_internal(bitidx < MI_COMMIT_MASK_BITS);
+
+  size_t bitcount = *full_size / MI_COMMIT_SIZE; // can be 0
+  if (bitidx + bitcount > MI_COMMIT_MASK_BITS) {
+    _mi_warning_message("commit mask overflow: idx=%zu count=%zu start=%zx end=%zx p=0x%p size=%zu fullsize=%zu\n", bitidx, bitcount, start, end, p, size, *full_size);
+  }
+  mi_assert_internal((bitidx + bitcount) <= MI_COMMIT_MASK_BITS);
+  mi_commit_mask_create(bitidx, bitcount, cm);
+}
+
+
+static bool mi_segment_commitx(mi_segment_t* segment, bool commit, uint8_t* p, size_t size, mi_stats_t* stats) {
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+
+  // commit liberal, but decommit conservative
+  uint8_t* start = NULL;
+  size_t   full_size = 0;
+  mi_commit_mask_t mask;
+  mi_segment_commit_mask(segment, !commit/*conservative*/, p, size, &start, &full_size, &mask);
+  if (mi_commit_mask_is_empty(&mask) || full_size==0) return true;
+
+  if (commit && !mi_commit_mask_all_set(&segment->commit_mask, &mask)) {
+    bool is_zero = false;
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+    _mi_stat_decrease(&_mi_stats_main.committed, _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+    if (!_mi_os_commit(start,full_size,&is_zero,stats)) return false;
+    mi_commit_mask_set(&segment->commit_mask, &mask);
+  }
+  else if (!commit && mi_commit_mask_any_set(&segment->commit_mask, &mask)) {
+    mi_assert_internal((void*)start != (void*)segment);
+    //mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &mask));
+
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);
+    _mi_stat_increase(&_mi_stats_main.committed, full_size - _mi_commit_mask_committed_size(&cmask, MI_SEGMENT_SIZE)); // adjust for overlap
+    if (segment->allow_decommit) {
+      _mi_os_decommit(start, full_size, stats); // ok if this fails
+    }
+    mi_commit_mask_clear(&segment->commit_mask, &mask);
+  }
+  // increase expiration of reusing part of the delayed decommit
+  if (commit && mi_commit_mask_any_set(&segment->decommit_mask, &mask)) {
+    segment->decommit_expire = _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+  }
+  // always undo delayed decommits
+  mi_commit_mask_clear(&segment->decommit_mask, &mask);
+  return true;
+}
+
+static bool mi_segment_ensure_committed(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+  // note: assumes commit_mask is always full for huge segments as otherwise the commit mask bits can overflow
+  if (mi_commit_mask_is_full(&segment->commit_mask) && mi_commit_mask_is_empty(&segment->decommit_mask)) return true; // fully committed
+  return mi_segment_commitx(segment,true,p,size,stats);
+}
+
+static void mi_segment_perhaps_decommit(mi_segment_t* segment, uint8_t* p, size_t size, mi_stats_t* stats) {
+  if (!segment->allow_decommit) return;
+  if (mi_option_get(mi_option_decommit_delay) == 0) {
+    mi_segment_commitx(segment, false, p, size, stats);
+  }
+  else {
+    // register for future decommit in the decommit mask
+    uint8_t* start = NULL;
+    size_t   full_size = 0;
+    mi_commit_mask_t mask;
+    mi_segment_commit_mask(segment, true /*conservative*/, p, size, &start, &full_size, &mask);
+    if (mi_commit_mask_is_empty(&mask) || full_size==0) return;
+
+    // update delayed commit
+    mi_assert_internal(segment->decommit_expire > 0 || mi_commit_mask_is_empty(&segment->decommit_mask));
+    mi_commit_mask_t cmask;
+    mi_commit_mask_create_intersect(&segment->commit_mask, &mask, &cmask);  // only decommit what is committed; span_free may try to decommit more
+    mi_commit_mask_set(&segment->decommit_mask, &cmask);
+    mi_msecs_t now = _mi_clock_now();
+    if (segment->decommit_expire == 0) {
+      // no previous decommits, initialize now
+      segment->decommit_expire = now + mi_option_get(mi_option_decommit_delay);
+    }
+    else if (segment->decommit_expire <= now) {
+      // previous decommit mask already expired
+      if (segment->decommit_expire + mi_option_get(mi_option_decommit_extend_delay) <= now) {
+	mi_segment_delayed_decommit(segment, true, stats);
+      }
+      else {
+	segment->decommit_expire = now + mi_option_get(mi_option_decommit_extend_delay); // (mi_option_get(mi_option_decommit_delay) / 8); // wait a tiny bit longer in case there is a series of free's
+      }
+    }
+    else {
+      // previous decommit mask is not yet expired, increase the expiration by a bit.
+      segment->decommit_expire += mi_option_get(mi_option_decommit_extend_delay);
+    }
+  }
+}
+
+static void mi_segment_delayed_decommit(mi_segment_t* segment, bool force, mi_stats_t* stats) {
+  if (!segment->allow_decommit || mi_commit_mask_is_empty(&segment->decommit_mask)) return;
+  mi_msecs_t now = _mi_clock_now();
+  if (!force && now < segment->decommit_expire) return;
+
+  mi_commit_mask_t mask = segment->decommit_mask;
+  segment->decommit_expire = 0;
+  mi_commit_mask_create_empty(&segment->decommit_mask);
+
+  size_t idx;
+  size_t count;
+  mi_commit_mask_foreach(&mask, idx, count) {
+    // if found, decommit that sequence
+    if (count > 0) {
+      uint8_t* p = (uint8_t*)segment + (idx*MI_COMMIT_SIZE);
+      size_t size = count * MI_COMMIT_SIZE;
+      mi_segment_commitx(segment, false, p, size, stats);
+    }
+  }
+  mi_commit_mask_foreach_end()
+  mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+}
+
+
+/* -----------------------------------------------------------
+   Span free
+----------------------------------------------------------- */
+
+static bool mi_segment_is_abandoned(mi_segment_t* segment) {
+  return (segment->thread_id == 0);
+}
+
+// note: can be called on abandoned segments
+static void mi_segment_span_free(mi_segment_t* segment, size_t slice_index, size_t slice_count, bool allow_decommit, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_index < segment->slice_entries);
+  mi_span_queue_t* sq = (segment->kind == MI_SEGMENT_HUGE || mi_segment_is_abandoned(segment)
+			  ? NULL : mi_span_queue_for(slice_count,tld));
+  if (slice_count==0) slice_count = 1;
+  mi_assert_internal(slice_index + slice_count - 1 < segment->slice_entries);
+
+  // set first and last slice (the intermediates can be undetermined)
+  mi_slice_t* slice = &segment->slices[slice_index];
+  slice->slice_count = (uint32_t)slice_count;
+  mi_assert_internal(slice->slice_count == slice_count); // no overflow?
+  slice->slice_offset = 0;
+  if (slice_count > 1) {
+    mi_slice_t* last = &segment->slices[slice_index + slice_count - 1];
+    last->slice_count = 0;
+    last->slice_offset = (uint32_t)(sizeof(mi_page_t)*(slice_count - 1));
+    last->xblock_size = 0;
+  }
+
+  // perhaps decommit
+  if (allow_decommit) {
+    mi_segment_perhaps_decommit(segment, mi_slice_start(slice), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats);
+  }
+
+  // and push it on the free page queue (if it was not a huge page)
+  if (sq != NULL) mi_span_queue_push( sq, slice );
+	     else slice->xblock_size = 0; // mark huge page as free anyways
+}
+
+/*
+// called from reclaim to add existing free spans
+static void mi_segment_span_add_free(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  mi_assert_internal(slice->xblock_size==0 && slice->slice_count>0 && slice->slice_offset==0);
+  size_t slice_index = mi_slice_index(slice);
+  mi_segment_span_free(segment,slice_index,slice->slice_count,tld);
+}
+*/
+
+static void mi_segment_span_remove_from_queue(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice->slice_count > 0 && slice->slice_offset==0 && slice->xblock_size==0);
+  mi_assert_internal(_mi_ptr_segment(slice)->kind != MI_SEGMENT_HUGE);
+  mi_span_queue_t* sq = mi_span_queue_for(slice->slice_count, tld);
+  mi_span_queue_delete(sq, slice);
+}
+
+// note: can be called on abandoned segments
+static mi_slice_t* mi_segment_span_free_coalesce(mi_slice_t* slice, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice != NULL && slice->slice_count > 0 && slice->slice_offset == 0);
+  mi_segment_t* segment = _mi_ptr_segment(slice);
+  bool is_abandoned = mi_segment_is_abandoned(segment);
+
+  // for huge pages, just mark as free but don't add to the queues
+  if (segment->kind == MI_SEGMENT_HUGE) {
+    mi_assert_internal(segment->used == 1);  // decreased right after this call in `mi_segment_page_clear`
+    slice->xblock_size = 0;  // mark as free anyways
+    // we should mark the last slice `xblock_size=0` now to maintain invariants but we skip it to
+    // avoid a possible cache miss (and the segment is about to be freed)
+    return slice;
+  }
+
+  // otherwise coalesce the span and add to the free span queues
+  size_t slice_count = slice->slice_count;
+  mi_slice_t* next = slice + slice->slice_count;
+  mi_assert_internal(next <= mi_segment_slices_end(segment));
+  if (next < mi_segment_slices_end(segment) && next->xblock_size==0) {
+    // free next block -- remove it from free and merge
+    mi_assert_internal(next->slice_count > 0 && next->slice_offset==0);
+    slice_count += next->slice_count; // extend
+    if (!is_abandoned) { mi_segment_span_remove_from_queue(next, tld); }
+  }
+  if (slice > segment->slices) {
+    mi_slice_t* prev = mi_slice_first(slice - 1);
+    mi_assert_internal(prev >= segment->slices);
+    if (prev->xblock_size==0) {
+      // free previous slice -- remove it from free and merge
+      mi_assert_internal(prev->slice_count > 0 && prev->slice_offset==0);
+      slice_count += prev->slice_count;
+      if (!is_abandoned) { mi_segment_span_remove_from_queue(prev, tld); }
+      slice = prev;
+    }
+  }
+
+  // and add the new free page
+  mi_segment_span_free(segment, mi_slice_index(slice), slice_count, true, tld);
+  return slice;
+}
+
+
+
+/* -----------------------------------------------------------
+   Page allocation
+----------------------------------------------------------- */
+
+// Note: may still return NULL if committing the memory failed
+static mi_page_t* mi_segment_span_allocate(mi_segment_t* segment, size_t slice_index, size_t slice_count, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_index < segment->slice_entries);
+  mi_slice_t* const slice = &segment->slices[slice_index];
+  mi_assert_internal(slice->xblock_size==0 || slice->xblock_size==1);
+
+  // commit before changing the slice data
+  if (!mi_segment_ensure_committed(segment, _mi_segment_page_start_from_slice(segment, slice, 0, NULL), slice_count * MI_SEGMENT_SLICE_SIZE, tld->stats)) {
+    return NULL;  // commit failed!
+  }
+
+  // convert the slices to a page
+  slice->slice_offset = 0;
+  slice->slice_count = (uint32_t)slice_count;
+  mi_assert_internal(slice->slice_count == slice_count);
+  const size_t bsize = slice_count * MI_SEGMENT_SLICE_SIZE;
+  slice->xblock_size = (uint32_t)(bsize >= MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : bsize);
+  mi_page_t*  page = mi_slice_to_page(slice);
+  mi_assert_internal(mi_page_block_size(page) == bsize);
+
+  // set slice back pointers for the first MI_MAX_SLICE_OFFSET entries
+  size_t extra = slice_count-1;
+  if (extra > MI_MAX_SLICE_OFFSET) extra = MI_MAX_SLICE_OFFSET;
+  if (slice_index + extra >= segment->slice_entries) extra = segment->slice_entries - slice_index - 1;  // huge objects may have more slices than avaiable entries in the segment->slices
+
+  mi_slice_t* slice_next = slice + 1;
+  for (size_t i = 1; i <= extra; i++, slice_next++) {
+    slice_next->slice_offset = (uint32_t)(sizeof(mi_slice_t)*i);
+    slice_next->slice_count = 0;
+    slice_next->xblock_size = 1;
+  }
+
+  // and also for the last one (if not set already) (the last one is needed for coalescing and for large alignments)
+  // note: the cast is needed for ubsan since the index can be larger than MI_SLICES_PER_SEGMENT for huge allocations (see #543)
+  mi_slice_t* last = slice + slice_count - 1;
+  mi_slice_t* end = (mi_slice_t*)mi_segment_slices_end(segment);
+  if (last > end) last = end;
+  if (last > slice) {
+    last->slice_offset = (uint32_t)(sizeof(mi_slice_t) * (last - slice));
+    last->slice_count = 0;
+    last->xblock_size = 1;
+  }
+
+  // and initialize the page
+  page->is_reset = false;
+  page->is_committed = true;
+  segment->used++;
+  return page;
+}
+
+static void mi_segment_slice_split(mi_segment_t* segment, mi_slice_t* slice, size_t slice_count, mi_segments_tld_t* tld) {
+  mi_assert_internal(_mi_ptr_segment(slice) == segment);
+  mi_assert_internal(slice->slice_count >= slice_count);
+  mi_assert_internal(slice->xblock_size > 0); // no more in free queue
+  if (slice->slice_count <= slice_count) return;
+  mi_assert_internal(segment->kind != MI_SEGMENT_HUGE);
+  size_t next_index = mi_slice_index(slice) + slice_count;
+  size_t next_count = slice->slice_count - slice_count;
+  mi_segment_span_free(segment, next_index, next_count, false /* don't decommit left-over part */, tld);
+  slice->slice_count = (uint32_t)slice_count;
+}
+
+static mi_page_t* mi_segments_page_find_and_allocate(size_t slice_count, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld) {
+  mi_assert_internal(slice_count*MI_SEGMENT_SLICE_SIZE <= MI_LARGE_OBJ_SIZE_MAX);
+  // search from best fit up
+  mi_span_queue_t* sq = mi_span_queue_for(slice_count, tld);
+  if (slice_count == 0) slice_count = 1;
+  while (sq <= &tld->spans[MI_SEGMENT_BIN_MAX]) {
+    for (mi_slice_t* slice = sq->first; slice != NULL; slice = slice->next) {
+      if (slice->slice_count >= slice_count) {
+	// found one
+	mi_segment_t* segment = _mi_ptr_segment(slice);
+	if (_mi_arena_memid_is_suitable(segment->memid, req_arena_id)) {
+	  // found a suitable page span
+	  mi_span_queue_delete(sq, slice);
+
+	  if (slice->slice_count > slice_count) {
+	    mi_segment_slice_split(segment, slice, slice_count, tld);
+	  }
+	  mi_assert_internal(slice != NULL && slice->slice_count == slice_count && slice->xblock_size > 0);
+	  mi_page_t* page = mi_segment_span_allocate(segment, mi_slice_index(slice), slice->slice_count, tld);
+	  if (page == NULL) {
+	    // commit failed; return NULL but first restore the slice
+	    mi_segment_span_free_coalesce(slice, tld);
+	    return NULL;
+	  }
+	  return page;
+	}
+      }
+    }
+    sq++;
+  }
+  // could not find a page..
+  return NULL;
+}
+
+
+/* -----------------------------------------------------------
+   Segment allocation
+----------------------------------------------------------- */
+
+static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment, bool eager_delay, mi_arena_id_t req_arena_id,
+					  size_t* psegment_slices, size_t* ppre_size, size_t* pinfo_slices,
+					  mi_commit_mask_t* pcommit_mask, mi_commit_mask_t* pdecommit_mask,
+					  bool* is_zero, bool* pcommit, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+
+{
+  // Allocate the segment from the OS
+  bool mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy
+  bool is_pinned = false;
+  size_t memid = 0;
+  size_t align_offset = 0;
+  size_t alignment = MI_SEGMENT_ALIGN;
+
+  if (page_alignment > 0) {
+    // mi_assert_internal(huge_page != NULL);
+    mi_assert_internal(page_alignment >= MI_SEGMENT_ALIGN);
+    alignment = page_alignment;
+    const size_t info_size = (*pinfo_slices) * MI_SEGMENT_SLICE_SIZE;
+    align_offset = _mi_align_up( info_size, MI_SEGMENT_ALIGN );
+    const size_t extra = align_offset - info_size;
+    // recalculate due to potential guard pages
+    *psegment_slices = mi_segment_calculate_slices(required + extra, ppre_size, pinfo_slices);
+    //segment_size += _mi_align_up(align_offset - info_size, MI_SEGMENT_SLICE_SIZE);
+    //segment_slices = segment_size / MI_SEGMENT_SLICE_SIZE;
+  }
+  const size_t segment_size = (*psegment_slices) * MI_SEGMENT_SLICE_SIZE;
+  mi_segment_t* segment = NULL;
+
+  // get from cache?
+  if (page_alignment == 0) {
+    segment = (mi_segment_t*)_mi_segment_cache_pop(segment_size, pcommit_mask, pdecommit_mask, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
+  }
+
+  // get from OS
+  if (segment==NULL) {
+    segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, pcommit, &mem_large, &is_pinned, is_zero, req_arena_id, &memid, os_tld);
+    if (segment == NULL) return NULL;  // failed to allocate
+    if (*pcommit) {
+      mi_commit_mask_create_full(pcommit_mask);
+    }
+    else {
+      mi_commit_mask_create_empty(pcommit_mask);
+    }
+  }
+  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
+
+  const size_t commit_needed = _mi_divide_up((*pinfo_slices)*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+  mi_assert_internal(commit_needed>0);
+  mi_commit_mask_t commit_needed_mask;
+  mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+  if (!mi_commit_mask_all_set(pcommit_mask, &commit_needed_mask)) {
+    // at least commit the info slices
+    mi_assert_internal(commit_needed*MI_COMMIT_SIZE >= (*pinfo_slices)*MI_SEGMENT_SLICE_SIZE);
+    bool ok = _mi_os_commit(segment, commit_needed*MI_COMMIT_SIZE, is_zero, tld->stats);
+    if (!ok) return NULL; // failed to commit
+    mi_commit_mask_set(pcommit_mask, &commit_needed_mask);
+  }
+  mi_track_mem_undefined(segment,commit_needed);
+  segment->memid = memid;
+  segment->mem_is_pinned = is_pinned;
+  segment->mem_is_large = mem_large;
+  segment->mem_is_committed = mi_commit_mask_is_full(pcommit_mask);
+  segment->mem_alignment = alignment;
+  segment->mem_align_offset = align_offset;
+  mi_segments_track_size((long)(segment_size), tld);
+  _mi_segment_map_allocated_at(segment);
+  return segment;
+}
+
+
+// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` .
+static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld, mi_page_t** huge_page)
+{
+  mi_assert_internal((required==0 && huge_page==NULL) || (required>0 && huge_page != NULL));
+
+  // calculate needed sizes first
+  size_t info_slices;
+  size_t pre_size;
+  size_t segment_slices = mi_segment_calculate_slices(required, &pre_size, &info_slices);
+
+  // Commit eagerly only if not the first N lazy segments (to reduce impact of many threads that allocate just a little)
+  const bool eager_delay = (// !_mi_os_has_overcommit() &&             // never delay on overcommit systems
+			    _mi_current_thread_count() > 1 &&       // do not delay for the first N threads
+			    tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
+  const bool eager = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
+  bool commit = eager || (required > 0);
+  bool is_zero = false;
+
+  mi_commit_mask_t commit_mask;
+  mi_commit_mask_t decommit_mask;
+  mi_commit_mask_create_empty(&commit_mask);
+  mi_commit_mask_create_empty(&decommit_mask);
+
+  // Allocate the segment from the OS
+  mi_segment_t* segment = mi_segment_os_alloc(required, page_alignment, eager_delay, req_arena_id,
+					      &segment_slices, &pre_size, &info_slices, &commit_mask, &decommit_mask,
+					      &is_zero, &commit, tld, os_tld);
+  if (segment == NULL) return NULL;
+
+  // zero the segment info? -- not always needed as it may be zero initialized from the OS
+  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);  // tsan
+  if (!is_zero) {
+    ptrdiff_t ofs = offsetof(mi_segment_t, next);
+    size_t    prefix = offsetof(mi_segment_t, slices) - ofs;
+    memset((uint8_t*)segment+ofs, 0, prefix + sizeof(mi_slice_t)*(segment_slices+1));  // one more
+  }
+
+  segment->commit_mask = commit_mask; // on lazy commit, the initial part is always committed
+  segment->allow_decommit = (mi_option_is_enabled(mi_option_allow_decommit) && !segment->mem_is_pinned && !segment->mem_is_large);
+  if (segment->allow_decommit) {
+    segment->decommit_expire = 0; // don't decommit just committed memory // _mi_clock_now() + mi_option_get(mi_option_decommit_delay);
+    segment->decommit_mask = decommit_mask;
+    mi_assert_internal(mi_commit_mask_all_set(&segment->commit_mask, &segment->decommit_mask));
+    #if MI_DEBUG>2
+    const size_t commit_needed = _mi_divide_up(info_slices*MI_SEGMENT_SLICE_SIZE, MI_COMMIT_SIZE);
+    mi_commit_mask_t commit_needed_mask;
+    mi_commit_mask_create(0, commit_needed, &commit_needed_mask);
+    mi_assert_internal(!mi_commit_mask_any_set(&segment->decommit_mask, &commit_needed_mask));
+    #endif
+  }
+
+  // initialize segment info
+  const size_t slice_entries = (segment_slices > MI_SLICES_PER_SEGMENT ? MI_SLICES_PER_SEGMENT : segment_slices);
+  segment->segment_slices = segment_slices;
+  segment->segment_info_slices = info_slices;
+  segment->thread_id = _mi_thread_id();
+  segment->cookie = _mi_ptr_cookie(segment);
+  segment->slice_entries = slice_entries;
+  segment->kind = (required == 0 ? MI_SEGMENT_NORMAL : MI_SEGMENT_HUGE);
+
+  // memset(segment->slices, 0, sizeof(mi_slice_t)*(info_slices+1));
+  _mi_stat_increase(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+  // set up guard pages
+  size_t guard_slices = 0;
+  if (MI_SECURE>0) {
+    // in secure mode, we set up a protected page in between the segment info
+    // and the page data, and at the end of the segment.
+    size_t os_pagesize = _mi_os_page_size();
+    mi_assert_internal(mi_segment_info_size(segment) - os_pagesize >= pre_size);
+    _mi_os_protect((uint8_t*)segment + mi_segment_info_size(segment) - os_pagesize, os_pagesize);
+    uint8_t* end = (uint8_t*)segment + mi_segment_size(segment) - os_pagesize;
+    mi_segment_ensure_committed(segment, end, os_pagesize, tld->stats);
+    _mi_os_protect(end, os_pagesize);
+    if (slice_entries == segment_slices) segment->slice_entries--; // don't use the last slice :-(
+    guard_slices = 1;
+  }
+
+  // reserve first slices for segment info
+  mi_page_t* page0 = mi_segment_span_allocate(segment, 0, info_slices, tld);
+  mi_assert_internal(page0!=NULL); if (page0==NULL) return NULL; // cannot fail as we always commit in advance
+  mi_assert_internal(segment->used == 1);
+  segment->used = 0; // don't count our internal slices towards usage
+
+  // initialize initial free pages
+  if (segment->kind == MI_SEGMENT_NORMAL) { // not a huge page
+    mi_assert_internal(huge_page==NULL);
+    mi_segment_span_free(segment, info_slices, segment->slice_entries - info_slices, false /* don't decommit */, tld);
+  }
+  else {
+    mi_assert_internal(huge_page!=NULL);
+    mi_assert_internal(mi_commit_mask_is_empty(&segment->decommit_mask));
+    mi_assert_internal(mi_commit_mask_is_full(&segment->commit_mask));
+    *huge_page = mi_segment_span_allocate(segment, info_slices, segment_slices - info_slices - guard_slices, tld);
+    mi_assert_internal(*huge_page != NULL); // cannot fail as we commit in advance
+  }
+
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+  return segment;
+}
+
+
+static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
+  MI_UNUSED(force);
+  mi_assert_internal(segment != NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used == 0);
+
+  // Remove the free pages
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  size_t page_count = 0;
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    mi_assert_internal(mi_slice_index(slice)==0 || slice->xblock_size == 0); // no more used pages ..
+    if (slice->xblock_size == 0 && segment->kind != MI_SEGMENT_HUGE) {
+      mi_segment_span_remove_from_queue(slice, tld);
+    }
+    page_count++;
+    slice = slice + slice->slice_count;
+  }
+  mi_assert_internal(page_count == 2); // first page is allocated by the segment itself
+
+  // stats
+  _mi_stat_decrease(&tld->stats->page_committed, mi_segment_info_size(segment));
+
+  // return it to the OS
+  mi_segment_os_free(segment, tld);
+}
+
+
+/* -----------------------------------------------------------
+   Page Free
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
+
+// note: can be called on abandoned pages
+static mi_slice_t* mi_segment_page_clear(mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert_internal(page->xblock_size > 0);
+  mi_assert_internal(mi_page_all_free(page));
+  mi_segment_t* segment = _mi_ptr_segment(page);
+  mi_assert_internal(segment->used > 0);
+
+  size_t inuse = page->capacity * mi_page_block_size(page);
+  _mi_stat_decrease(&tld->stats->page_committed, inuse);
+  _mi_stat_decrease(&tld->stats->pages, 1);
+
+  // reset the page memory to reduce memory pressure?
+  if (!segment->mem_is_pinned && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
+    size_t psize;
+    uint8_t* start = _mi_page_start(segment, page, &psize);
+    page->is_reset = true;
+    _mi_os_reset(start, psize, tld->stats);
+  }
+
+  // zero the page data, but not the segment fields
+  page->is_zero_init = false;
+  ptrdiff_t ofs = offsetof(mi_page_t, capacity);
+  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
+  page->xblock_size = 1;
+
+  // and free it
+  mi_slice_t* slice = mi_segment_span_free_coalesce(mi_page_to_slice(page), tld);
+  segment->used--;
+  // cannot assert segment valid as it is called during reclaim
+  // mi_assert_expensive(mi_segment_is_valid(segment, tld));
+  return slice;
+}
+
+void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
+{
+  mi_assert(page != NULL);
+
+  mi_segment_t* segment = _mi_page_segment(page);
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+
+  // mark it as free now
+  mi_segment_page_clear(page, tld);
+  mi_assert_expensive(mi_segment_is_valid(segment, tld));
+
+  if (segment->used == 0) {
+    // no more used pages; remove from the free list and free the segment
+    mi_segment_free(segment, force, tld);
+  }
+  else if (segment->used == segment->abandoned) {
+    // only abandoned pages; remove from free list and abandon
+    mi_segment_abandon(segment,tld);
+  }
+}
+
+
+/* -----------------------------------------------------------
+Abandonment
+
+When threads terminate, they can leave segments with
+live blocks (reachable through other threads). Such segments
+are "abandoned" and will be reclaimed by other threads to
+reuse their pages and/or free them eventually
+
+We maintain a global list of abandoned segments that are
+reclaimed on demand. Since this is shared among threads
+the implementation needs to avoid the A-B-A problem on
+popping abandoned segments: <https://en.wikipedia.org/wiki/ABA_problem>
+We use tagged pointers to avoid accidentially identifying
+reused segments, much like stamped references in Java.
+Secondly, we maintain a reader counter to avoid resetting
+or decommitting segments that have a pending read operation.
+
+Note: the current implementation is one possible design;
+another way might be to keep track of abandoned segments
+in the arenas/segment_cache's. This would have the advantage of keeping
+all concurrent code in one place and not needing to deal
+with ABA issues. The drawback is that it is unclear how to
+scan abandoned segments efficiently in that case as they
+would be spread among all other segments in the arenas.
+----------------------------------------------------------- */
+
+// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
+// to put in a tag that increments on update to avoid the A-B-A problem.
+#define MI_TAGGED_MASK   MI_SEGMENT_MASK
+typedef uintptr_t        mi_tagged_segment_t;
+
+static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) {
+  return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
+}
+
+static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_segment_t ts) {
+  mi_assert_internal(((uintptr_t)segment & MI_TAGGED_MASK) == 0);
+  uintptr_t tag = ((ts & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK;
+  return ((uintptr_t)segment | tag);
+}
+
+// This is a list of visited abandoned pages that were full at the time.
+// this list migrates to `abandoned` when that becomes NULL. The use of
+// this list reduces contention and the rate at which segments are visited.
+static mi_decl_cache_align _Atomic(mi_segment_t*)       abandoned_visited; // = NULL
+
+// The abandoned page list (tagged as it supports pop)
+static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned;         // = NULL
+
+// Maintain these for debug purposes (these counts may be a bit off)
+static mi_decl_cache_align _Atomic(size_t)           abandoned_count;
+static mi_decl_cache_align _Atomic(size_t)           abandoned_visited_count;
+
+// We also maintain a count of current readers of the abandoned list
+// in order to prevent resetting/decommitting segment memory if it might
+// still be read.
+static mi_decl_cache_align _Atomic(size_t)           abandoned_readers; // = 0
+
+// Push on the visited list
+static void mi_abandoned_visited_push(mi_segment_t* segment) {
+  mi_assert_internal(segment->thread_id == 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used > 0);
+  mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
+  do {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
+  } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
+  mi_atomic_increment_relaxed(&abandoned_visited_count);
+}
+
+// Move the visited list to the abandoned list.
+static bool mi_abandoned_visited_revisit(void)
+{
+  // quick check if the visited list is empty
+  if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
+
+  // grab the whole visited list
+  mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
+  if (first == NULL) return false;
+
+  // first try to swap directly if the abandoned list happens to be NULL
+  mi_tagged_segment_t afirst;
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  if (mi_tagged_segment_ptr(ts)==NULL) {
+    size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    afirst = mi_tagged_segment(first, ts);
+    if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
+      mi_atomic_add_relaxed(&abandoned_count, count);
+      mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+      return true;
+    }
+  }
+
+  // find the last element of the visited list: O(n)
+  mi_segment_t* last = first;
+  mi_segment_t* next;
+  while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
+    last = next;
+  }
+
+  // and atomically prepend to the abandoned list
+  // (no need to increase the readers as we don't access the abandoned segments)
+  mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
+  size_t count;
+  do {
+    count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
+    afirst = mi_tagged_segment(first, anext);
+  } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
+  mi_atomic_add_relaxed(&abandoned_count, count);
+  mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+  return true;
+}
+
+// Push on the abandoned list.
+static void mi_abandoned_push(mi_segment_t* segment) {
+  mi_assert_internal(segment->thread_id == 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->used > 0);
+  mi_tagged_segment_t next;
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  do {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
+    next = mi_tagged_segment(segment, ts);
+  } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
+  mi_atomic_increment_relaxed(&abandoned_count);
+}
+
+// Wait until there are no more pending reads on segments that used to be in the abandoned list
+// called for example from `arena.c` before decommitting
+void _mi_abandoned_await_readers(void) {
+  size_t n;
+  do {
+    n = mi_atomic_load_acquire(&abandoned_readers);
+    if (n != 0) mi_atomic_yield();
+  } while (n != 0);
+}
+
+// Pop from the abandoned list
+static mi_segment_t* mi_abandoned_pop(void) {
+  mi_segment_t* segment;
+  // Check efficiently if it is empty (or if the visited list needs to be moved)
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  segment = mi_tagged_segment_ptr(ts);
+  if mi_likely(segment == NULL) {
+    if mi_likely(!mi_abandoned_visited_revisit()) { // try to swap in the visited list on NULL
+      return NULL;
+    }
+  }
+
+  // Do a pop. We use a reader count to prevent
+  // a segment to be decommitted while a read is still pending,
+  // and a tagged pointer to prevent A-B-A link corruption.
+  // (this is called from `region.c:_mi_mem_free` for example)
+  mi_atomic_increment_relaxed(&abandoned_readers);  // ensure no segment gets decommitted
+  mi_tagged_segment_t next = 0;
+  ts = mi_atomic_load_acquire(&abandoned);
+  do {
+    segment = mi_tagged_segment_ptr(ts);
+    if (segment != NULL) {
+      mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
+      next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
+    }
+  } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
+  mi_atomic_decrement_relaxed(&abandoned_readers);  // release reader lock
+  if (segment != NULL) {
+    mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+    mi_atomic_decrement_relaxed(&abandoned_count);
+  }
+  return segment;
+}
+
+/* -----------------------------------------------------------
+   Abandon segment/page
+----------------------------------------------------------- */
+
+static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
+  mi_assert_internal(segment->used == segment->abandoned);
+  mi_assert_internal(segment->used > 0);
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_internal(segment->abandoned_visits == 0);
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+
+  // remove the free pages from the free page queues
+  mi_slice_t* slice = &segment->slices[0];
+  const mi_slice_t* end = mi_segment_slices_end(segment);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (slice->xblock_size == 0) { // a free page
+      mi_segment_span_remove_from_queue(slice,tld);
+      slice->xblock_size = 0; // but keep it free
+    }
+    slice = slice + slice->slice_count;
+  }
+
+  // perform delayed decommits
+  mi_segment_delayed_decommit(segment, mi_option_is_enabled(mi_option_abandoned_page_decommit) /* force? */, tld->stats);
+
+  // all pages in the segment are abandoned; add it to the abandoned list
+  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
+  mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
+  segment->thread_id = 0;
+  mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
+  segment->abandoned_visits = 1;   // from 0 to 1 to signify it is abandoned
+  mi_abandoned_push(segment);
+}
+
+void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
+  mi_assert(page != NULL);
+  mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+  mi_assert_internal(mi_page_heap(page) == NULL);
+  mi_segment_t* segment = _mi_page_segment(page);
+
+  mi_assert_expensive(mi_segment_is_valid(segment,tld));
+  segment->abandoned++;
+
+  _mi_stat_increase(&tld->stats->pages_abandoned, 1);
+  mi_assert_internal(segment->abandoned <= segment->used);
+  if (segment->used == segment->abandoned) {
+    // all pages are abandoned, abandon the entire segment
+    mi_segment_abandon(segment, tld);
+  }
+}
+
+/* -----------------------------------------------------------
+  Reclaim abandoned pages
+----------------------------------------------------------- */
+
+static mi_slice_t* mi_slices_start_iterate(mi_segment_t* segment, const mi_slice_t** end) {
+  mi_slice_t* slice = &segment->slices[0];
+  *end = mi_segment_slices_end(segment);
+  mi_assert_internal(slice->slice_count>0 && slice->xblock_size>0); // segment allocated page
+  slice = slice + slice->slice_count; // skip the first segment allocated page
+  return slice;
+}
+
+// Possibly free pages and check if free space is available
+static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, size_t block_size, mi_segments_tld_t* tld)
+{
+  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+  mi_assert_internal(mi_segment_is_abandoned(segment));
+  bool has_page = false;
+
+  // for all slices
+  const mi_slice_t* end;
+  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (mi_slice_is_used(slice)) { // used page
+      // ensure used count is up to date and collect potential concurrent frees
+      mi_page_t* const page = mi_slice_to_page(slice);
+      _mi_page_free_collect(page, false);
+      if (mi_page_all_free(page)) {
+	// if this page is all free now, free it without adding to any queues (yet)
+	mi_assert_internal(page->next == NULL && page->prev==NULL);
+	_mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+	segment->abandoned--;
+	slice = mi_segment_page_clear(page, tld); // re-assign slice due to coalesce!
+	mi_assert_internal(!mi_slice_is_used(slice));
+	if (slice->slice_count >= slices_needed) {
+	  has_page = true;
+	}
+      }
+      else {
+	if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
+	  // a page has available free blocks of the right size
+	  has_page = true;
+	}
+      }
+    }
+    else {
+      // empty span
+      if (slice->slice_count >= slices_needed) {
+	has_page = true;
+      }
+    }
+    slice = slice + slice->slice_count;
+  }
+  return has_page;
+}
+
+// Reclaim an abandoned segment; returns NULL if the segment was freed
+// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
+static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
+  mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
+  mi_assert_expensive(mi_segment_is_valid(segment, tld));
+  if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
+
+  segment->thread_id = _mi_thread_id();
+  segment->abandoned_visits = 0;
+  mi_segments_track_size((long)mi_segment_size(segment), tld);
+  mi_assert_internal(segment->next == NULL);
+  _mi_stat_decrease(&tld->stats->segments_abandoned, 1);
+
+  // for all slices
+  const mi_slice_t* end;
+  mi_slice_t* slice = mi_slices_start_iterate(segment, &end);
+  while (slice < end) {
+    mi_assert_internal(slice->slice_count > 0);
+    mi_assert_internal(slice->slice_offset == 0);
+    if (mi_slice_is_used(slice)) {
+      // in use: reclaim the page in our heap
+      mi_page_t* page = mi_slice_to_page(slice);
+      mi_assert_internal(!page->is_reset);
+      mi_assert_internal(page->is_committed);
+      mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE);
+      mi_assert_internal(mi_page_heap(page) == NULL);
+      mi_assert_internal(page->next == NULL && page->prev==NULL);
+      _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
+      segment->abandoned--;
+      // set the heap again and allow delayed free again
+      mi_page_set_heap(page, heap);
+      _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set)
+      _mi_page_free_collect(page, false); // ensure used count is up to date
+      if (mi_page_all_free(page)) {
+	// if everything free by now, free the page
+	slice = mi_segment_page_clear(page, tld);   // set slice again due to coalesceing
+      }
+      else {
+	// otherwise reclaim it into the heap
+	_mi_page_reclaim(heap, page);
+	if (requested_block_size == page->xblock_size && mi_page_has_any_available(page)) {
+	  if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
+	}
+      }
+    }
+    else {
+      // the span is free, add it to our page queues
+      slice = mi_segment_span_free_coalesce(slice, tld); // set slice again due to coalesceing
+    }
+    mi_assert_internal(slice->slice_count>0 && slice->slice_offset==0);
+    slice = slice + slice->slice_count;
+  }
+
+  mi_assert(segment->abandoned == 0);
+  if (segment->used == 0) {  // due to page_clear
+    mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed));
+    mi_segment_free(segment, false, tld);
+    return NULL;
+  }
+  else {
+    return segment;
+  }
+}
+
+
+void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
+  mi_segment_t* segment;
+  while ((segment = mi_abandoned_pop()) != NULL) {
+    mi_segment_reclaim(segment, heap, 0, NULL, tld);
+  }
+}
+
+static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slices, size_t block_size, bool* reclaimed, mi_segments_tld_t* tld)
+{
+  *reclaimed = false;
+  mi_segment_t* segment;
+  long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024);     // limit the work to bound allocation times
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+    segment->abandoned_visits++;
+    // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments
+    // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way?
+    bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid);
+    bool has_page = mi_segment_check_free(segment,needed_slices,block_size,tld); // try to free up pages (due to concurrent frees)
+    if (segment->used == 0) {
+      // free the segment (by forced reclaim) to make it available to other threads.
+      // note1: we prefer to free a segment as that might lead to reclaiming another
+      // segment that is still partially used.
+      // note2: we could in principle optimize this by skipping reclaim and directly
+      // freeing but that would violate some invariants temporarily)
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else if (has_page && is_suitable) {
+      // found a large enough free span, or a page of the right block_size with free space
+      // we return the result of reclaim (which is usually `segment`) as it might free
+      // the segment due to concurrent frees (in which case `NULL` is returned).
+      return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
+    }
+    else if (segment->abandoned_visits > 3 && is_suitable) {
+      // always reclaim on 3rd visit to limit the abandoned queue length.
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else {
+      // otherwise, push on the visited list so it gets not looked at too quickly again
+      mi_segment_delayed_decommit(segment, true /* force? */, tld->stats); // forced decommit if needed as we may not visit soon again
+      mi_abandoned_visited_push(segment);
+    }
+  }
+  return NULL;
+}
+
+
+void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
+{
+  mi_segment_t* segment;
+  int max_tries = (force ? 16*1024 : 1024); // limit latency
+  if (force) {
+    mi_abandoned_visited_revisit();
+  }
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+    mi_segment_check_free(segment,0,0,tld); // try to free up pages (due to concurrent frees)
+    if (segment->used == 0) {
+      // free the segment (by forced reclaim) to make it available to other threads.
+      // note: we could in principle optimize this by skipping reclaim and directly
+      // freeing but that would violate some invariants temporarily)
+      mi_segment_reclaim(segment, heap, 0, NULL, tld);
+    }
+    else {
+      // otherwise, decommit if needed and push on the visited list
+      // note: forced decommit can be expensive if many threads are destroyed/created as in mstress.
+      mi_segment_delayed_decommit(segment, force, tld->stats);
+      mi_abandoned_visited_push(segment);
+    }
+  }
+}
+
+/* -----------------------------------------------------------
+   Reclaim or allocate
+----------------------------------------------------------- */
+
+static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_slices, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
+  mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX);
+
+  // 1. try to reclaim an abandoned segment
+  bool reclaimed;
+  mi_segment_t* segment = mi_segment_try_reclaim(heap, needed_slices, block_size, &reclaimed, tld);
+  if (reclaimed) {
+    // reclaimed the right page right into the heap
+    mi_assert_internal(segment != NULL);
+    return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks
+  }
+  else if (segment != NULL) {
+    // reclaimed a segment with a large enough empty span in it
+    return segment;
+  }
+  // 2. otherwise allocate a fresh segment
+  return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);
+}
+
+
+/* -----------------------------------------------------------
+   Page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segments_page_alloc(mi_heap_t* heap, mi_page_kind_t page_kind, size_t required, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_assert_internal(required <= MI_LARGE_OBJ_SIZE_MAX && page_kind <= MI_PAGE_LARGE);
+
+  // find a free page
+  size_t page_size = _mi_align_up(required, (required > MI_MEDIUM_PAGE_SIZE ? MI_MEDIUM_PAGE_SIZE : MI_SEGMENT_SLICE_SIZE));
+  size_t slices_needed = page_size / MI_SEGMENT_SLICE_SIZE;
+  mi_assert_internal(slices_needed * MI_SEGMENT_SLICE_SIZE == page_size);
+  mi_page_t* page = mi_segments_page_find_and_allocate(slices_needed, heap->arena_id, tld); //(required <= MI_SMALL_SIZE_MAX ? 0 : slices_needed), tld);
+  if (page==NULL) {
+    // no free page, allocate a new segment and try again
+    if (mi_segment_reclaim_or_alloc(heap, slices_needed, block_size, tld, os_tld) == NULL) {
+      // OOM or reclaimed a good page in the heap
+      return NULL;
+    }
+    else {
+      // otherwise try again
+      return mi_segments_page_alloc(heap, page_kind, required, block_size, tld, os_tld);
+    }
+  }
+  mi_assert_internal(page != NULL && page->slice_count*MI_SEGMENT_SLICE_SIZE == page_size);
+  mi_assert_internal(_mi_ptr_segment(page)->thread_id == _mi_thread_id());
+  mi_segment_delayed_decommit(_mi_ptr_segment(page), false, tld->stats);
+  return page;
+}
+
+
+
+/* -----------------------------------------------------------
+   Huge page allocation
+----------------------------------------------------------- */
+
+static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
+{
+  mi_page_t* page = NULL;
+  mi_segment_t* segment = mi_segment_alloc(size,page_alignment,req_arena_id,tld,os_tld,&page);
+  if (segment == NULL || page==NULL) return NULL;
+  mi_assert_internal(segment->used==1);
+  mi_assert_internal(mi_page_block_size(page) >= size);
+  #if MI_HUGE_PAGE_ABANDON
+  segment->thread_id = 0; // huge segments are immediately abandoned
+  #endif
+
+  // for huge pages we initialize the xblock_size as we may
+  // overallocate to accommodate large alignments.
+  size_t psize;
+  uint8_t* start = _mi_segment_page_start(segment, page, &psize);
+  page->xblock_size = (psize > MI_HUGE_BLOCK_SIZE ? MI_HUGE_BLOCK_SIZE : (uint32_t)psize);
+
+  // decommit the part of the prefix of a page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE)
+  if (page_alignment > 0 && segment->allow_decommit) {
+    uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment);
+    mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment));
+    mi_assert_internal(psize - (aligned_p - start) >= size);
+    uint8_t* decommit_start = start + sizeof(mi_block_t);              // for the free list
+    ptrdiff_t decommit_size = aligned_p - decommit_start;
+    _mi_os_decommit(decommit_start, decommit_size, &_mi_stats_main);   // note: cannot use segment_decommit on huge segments
+  }
+
+  return page;
+}
+
+#if MI_HUGE_PAGE_ABANDON
+// free huge block from another thread
+void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+  // huge page segments are always abandoned and can be freed immediately by any thread
+  mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
+  mi_assert_internal(segment == _mi_page_segment(page));
+  mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
+
+  // claim it and free
+  mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
+  // paranoia: if this it the last reference, the cas should always succeed
+  size_t expected_tid = 0;
+  if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
+    mi_block_set_next(page, block, page->free);
+    page->free = block;
+    page->used--;
+    page->is_zero = false;
+    mi_assert(page->used == 0);
+    mi_tld_t* tld = heap->tld;
+    _mi_segment_page_free(page, true, &tld->segments);
+  }
+#if (MI_DEBUG!=0)
+  else {
+    mi_assert_internal(false);
+  }
+#endif
+}
+
+#else
+// reset memory of a huge block from another thread
+void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) {
+  MI_UNUSED(page);
+  mi_assert_internal(segment->kind == MI_SEGMENT_HUGE);
+  mi_assert_internal(segment == _mi_page_segment(page));
+  mi_assert_internal(page->used == 1); // this is called just before the free
+  mi_assert_internal(page->free == NULL);
+  if (segment->allow_decommit) {
+    const size_t csize = mi_usable_size(block) - sizeof(mi_block_t);
+    uint8_t* p = (uint8_t*)block + sizeof(mi_block_t);
+    _mi_os_decommit(p, csize, &_mi_stats_main);  // note: cannot use segment_decommit on huge segments
+  }
+}
+#endif
+
+/* -----------------------------------------------------------
+   Page allocation and free
+----------------------------------------------------------- */
+mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
+  mi_page_t* page;
+  if mi_unlikely(page_alignment > MI_ALIGNMENT_MAX) {
+    mi_assert_internal(_mi_is_power_of_two(page_alignment));
+    mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE);
+    if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; }
+    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);
+  }
+  else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_SMALL,block_size,block_size,tld,os_tld);
+  }
+  else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_MEDIUM,MI_MEDIUM_PAGE_SIZE,block_size,tld, os_tld);
+  }
+  else if (block_size <= MI_LARGE_OBJ_SIZE_MAX) {
+    page = mi_segments_page_alloc(heap,MI_PAGE_LARGE,block_size,block_size,tld, os_tld);
+  }
+  else {
+    page = mi_segment_huge_page_alloc(block_size,page_alignment,heap->arena_id,tld,os_tld);
+  }
+  mi_assert_internal(page == NULL || _mi_heap_memid_is_suitable(heap, _mi_page_segment(page)->memid));
+  mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld));
+  return page;
+}
diff --git a/compat/mimalloc/stats.c b/compat/mimalloc/stats.c
new file mode 100644
index 0000000000..c09d816c4e
--- /dev/null
+++ b/compat/mimalloc/stats.c
@@ -0,0 +1,618 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2018-2021, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <stdio.h>  // fputs, stderr
+#include <string.h> // memset
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920)
+#pragma warning(disable:4204)  // non-constant aggregate initializer
+#endif
+
+/* -----------------------------------------------------------
+  Statistics operations
+----------------------------------------------------------- */
+
+static bool mi_is_in_main(void* stat) {
+  return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main
+	 && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t)));
+}
+
+static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
+  if (amount == 0) return;
+  if (mi_is_in_main(stat))
+  {
+    // add atomically (for abandoned pages)
+    int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+    mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
+    if (amount > 0) {
+      mi_atomic_addi64_relaxed(&stat->allocated,amount);
+    }
+    else {
+      mi_atomic_addi64_relaxed(&stat->freed, -amount);
+    }
+  }
+  else {
+    // add thread local
+    stat->current += amount;
+    if (stat->current > stat->peak) stat->peak = stat->current;
+    if (amount > 0) {
+      stat->allocated += amount;
+    }
+    else {
+      stat->freed += -amount;
+    }
+  }
+}
+
+void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
+  if (mi_is_in_main(stat)) {
+    mi_atomic_addi64_relaxed( &stat->count, 1 );
+    mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
+  }
+  else {
+    stat->count++;
+    stat->total += amount;
+  }
+}
+
+void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, (int64_t)amount);
+}
+
+void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
+  mi_stat_update(stat, -((int64_t)amount));
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
+  if (stat==src) return;
+  if (src->allocated==0 && src->freed==0) return;
+  mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
+  mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
+  mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
+  // peak scores do not work across threads..
+  mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
+}
+
+static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
+  if (stat==src) return;
+  mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
+  mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
+}
+
+// must be thread safe as it is called from stats_merge
+static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
+  if (stats==src) return;
+  mi_stat_add(&stats->segments, &src->segments,1);
+  mi_stat_add(&stats->pages, &src->pages,1);
+  mi_stat_add(&stats->reserved, &src->reserved, 1);
+  mi_stat_add(&stats->committed, &src->committed, 1);
+  mi_stat_add(&stats->reset, &src->reset, 1);
+  mi_stat_add(&stats->page_committed, &src->page_committed, 1);
+
+  mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1);
+  mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1);
+  mi_stat_add(&stats->threads, &src->threads, 1);
+
+  mi_stat_add(&stats->malloc, &src->malloc, 1);
+  mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
+  mi_stat_add(&stats->normal, &src->normal, 1);
+  mi_stat_add(&stats->huge, &src->huge, 1);
+  mi_stat_add(&stats->large, &src->large, 1);
+
+  mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1);
+  mi_stat_counter_add(&stats->mmap_calls, &src->mmap_calls, 1);
+  mi_stat_counter_add(&stats->commit_calls, &src->commit_calls, 1);
+
+  mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
+  mi_stat_counter_add(&stats->searches, &src->searches, 1);
+  mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
+  mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
+  mi_stat_counter_add(&stats->large_count, &src->large_count, 1);
+#if MI_STAT>1
+  for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
+    if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
+      mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1);
+    }
+  }
+#endif
+}
+
+/* -----------------------------------------------------------
+  Display statistics
+----------------------------------------------------------- */
+
+// unit > 0 : size in binary bytes
+// unit == 0: count as decimal
+// unit < 0 : count in binary
+static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) {
+  char buf[32]; buf[0] = 0;
+  int  len = 32;
+  const char* suffix = (unit <= 0 ? " " : "B");
+  const int64_t base = (unit == 0 ? 1000 : 1024);
+  if (unit>0) n *= unit;
+
+  const int64_t pos = (n < 0 ? -n : n);
+  if (pos < base) {
+    if (n!=1 || suffix[0] != 'B') {  // skip printing 1 B for the unit column
+      snprintf(buf, len, "%d %-3s", (int)n, (n==0 ? "" : suffix));
+    }
+  }
+  else {
+    int64_t divider = base;
+    const char* magnitude = "K";
+    if (pos >= divider*base) { divider *= base; magnitude = "M"; }
+    if (pos >= divider*base) { divider *= base; magnitude = "G"; }
+    const int64_t tens = (n / (divider/10));
+    const long whole = (long)(tens/10);
+    const long frac1 = (long)(tens%10);
+    char unitdesc[8];
+    snprintf(unitdesc, 8, "%s%s%s", magnitude, (base==1024 ? "i" : ""), suffix);
+    snprintf(buf, len, "%ld.%ld %-3s", whole, (frac1 < 0 ? -frac1 : frac1), unitdesc);
+  }
+  _mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
+}
+
+
+static void mi_print_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+  mi_printf_amount(n,unit,out,arg,NULL);
+}
+
+static void mi_print_count(int64_t n, int64_t unit, mi_output_fun* out, void* arg) {
+  if (unit==1) _mi_fprintf(out, arg, "%11s"," ");
+	  else mi_print_amount(n,0,out,arg);
+}
+
+static void mi_stat_print_ex(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg, const char* notok ) {
+  _mi_fprintf(out, arg,"%10s:", msg);
+  if (unit > 0) {
+    mi_print_amount(stat->peak, unit, out, arg);
+    mi_print_amount(stat->allocated, unit, out, arg);
+    mi_print_amount(stat->freed, unit, out, arg);
+    mi_print_amount(stat->current, unit, out, arg);
+    mi_print_amount(unit, 1, out, arg);
+    mi_print_count(stat->allocated, unit, out, arg);
+    if (stat->allocated > stat->freed) {
+      _mi_fprintf(out, arg, "  ");
+      _mi_fprintf(out, arg, (notok == NULL ? "not all freed!" : notok));
+      _mi_fprintf(out, arg, "\n");
+    }
+    else {
+      _mi_fprintf(out, arg, "  ok\n");
+    }
+  }
+  else if (unit<0) {
+    mi_print_amount(stat->peak, -1, out, arg);
+    mi_print_amount(stat->allocated, -1, out, arg);
+    mi_print_amount(stat->freed, -1, out, arg);
+    mi_print_amount(stat->current, -1, out, arg);
+    if (unit==-1) {
+      _mi_fprintf(out, arg, "%22s", "");
+    }
+    else {
+      mi_print_amount(-unit, 1, out, arg);
+      mi_print_count((stat->allocated / -unit), 0, out, arg);
+    }
+    if (stat->allocated > stat->freed)
+      _mi_fprintf(out, arg, "  not all freed!\n");
+    else
+      _mi_fprintf(out, arg, "  ok\n");
+  }
+  else {
+    mi_print_amount(stat->peak, 1, out, arg);
+    mi_print_amount(stat->allocated, 1, out, arg);
+    _mi_fprintf(out, arg, "%11s", " ");  // no freed
+    mi_print_amount(stat->current, 1, out, arg);
+    _mi_fprintf(out, arg, "\n");
+  }
+}
+
+static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t unit, mi_output_fun* out, void* arg) {
+  mi_stat_print_ex(stat, msg, unit, out, arg, NULL);
+}
+
+static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg ) {
+  _mi_fprintf(out, arg, "%10s:", msg);
+  mi_print_amount(stat->total, -1, out, arg);
+  _mi_fprintf(out, arg, "\n");
+}
+
+static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out, void* arg) {
+  const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count));
+  const long avg_whole = (long)(avg_tens/10);
+  const long avg_frac1 = (long)(avg_tens%10);
+  _mi_fprintf(out, arg, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
+}
+
+
+static void mi_print_header(mi_output_fun* out, void* arg ) {
+  _mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak   ", "total   ", "freed   ", "current   ", "unit   ", "count   ");
+}
+
+#if MI_STAT>1
+static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
+  bool found = false;
+  char buf[64];
+  for (size_t i = 0; i <= max; i++) {
+    if (bins[i].allocated > 0) {
+      found = true;
+      int64_t unit = _mi_bin_size((uint8_t)i);
+      snprintf(buf, 64, "%s %3lu", fmt, (long)i);
+      mi_stat_print(&bins[i], buf, unit, out, arg);
+    }
+  }
+  if (found) {
+    _mi_fprintf(out, arg, "\n");
+    mi_print_header(out, arg);
+  }
+}
+#endif
+
+
+
+//------------------------------------------------------------
+// Use an output wrapper for line-buffered output
+// (which is nice when using loggers etc.)
+//------------------------------------------------------------
+typedef struct buffered_s {
+  mi_output_fun* out;   // original output function
+  void*          arg;   // and state
+  char*          buf;   // local buffer of at least size `count+1`
+  size_t         used;  // currently used chars `used <= count`
+  size_t         count; // total chars available for output
+} buffered_t;
+
+static void mi_buffered_flush(buffered_t* buf) {
+  buf->buf[buf->used] = 0;
+  _mi_fputs(buf->out, buf->arg, NULL, buf->buf);
+  buf->used = 0;
+}
+
+static void mi_cdecl mi_buffered_out(const char* msg, void* arg) {
+  buffered_t* buf = (buffered_t*)arg;
+  if (msg==NULL || buf==NULL) return;
+  for (const char* src = msg; *src != 0; src++) {
+    char c = *src;
+    if (buf->used >= buf->count) mi_buffered_flush(buf);
+    mi_assert_internal(buf->used < buf->count);
+    buf->buf[buf->used++] = c;
+    if (c == '\n') mi_buffered_flush(buf);
+  }
+}
+
+//------------------------------------------------------------
+// Print statistics
+//------------------------------------------------------------
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
+
+static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
+  // wrap the output function to be line buffered
+  char buf[256];
+  buffered_t buffer = { out0, arg0, NULL, 0, 255 };
+  buffer.buf = buf;
+  mi_output_fun* out = &mi_buffered_out;
+  void* arg = &buffer;
+
+  // and print using that
+  mi_print_header(out,arg);
+  #if MI_STAT>1
+  mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg);
+  #endif
+  #if MI_STAT
+  mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
+  mi_stat_print(&stats->large, "large", (stats->large_count.count == 0 ? 1 : -(stats->large.allocated / stats->large_count.count)), out, arg);
+  mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
+  mi_stat_count_t total = { 0,0,0,0 };
+  mi_stat_add(&total, &stats->normal, 1);
+  mi_stat_add(&total, &stats->large, 1);
+  mi_stat_add(&total, &stats->huge, 1);
+  mi_stat_print(&total, "total", 1, out, arg);
+  #endif
+  #if MI_STAT>1
+  mi_stat_print(&stats->malloc, "malloc req", 1, out, arg);
+  _mi_fprintf(out, arg, "\n");
+  #endif
+  mi_stat_print_ex(&stats->reserved, "reserved", 1, out, arg, "");
+  mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, "");
+  mi_stat_print(&stats->reset, "reset", 1, out, arg);
+  mi_stat_print(&stats->page_committed, "touched", 1, out, arg);
+  mi_stat_print(&stats->segments, "segments", -1, out, arg);
+  mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg);
+  mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg);
+  mi_stat_print(&stats->pages, "pages", -1, out, arg);
+  mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg);
+  mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg);
+  mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg);
+  mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg);
+  mi_stat_counter_print(&stats->commit_calls, "commits", out, arg);
+  mi_stat_print(&stats->threads, "threads", -1, out, arg);
+  mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
+  _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count());
+
+  mi_msecs_t elapsed;
+  mi_msecs_t user_time;
+  mi_msecs_t sys_time;
+  size_t current_rss;
+  size_t peak_rss;
+  size_t current_commit;
+  size_t peak_commit;
+  size_t page_faults;
+  mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
+  _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
+  _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
+	      user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
+  mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
+  if (peak_commit > 0) {
+    _mi_fprintf(out, arg, ", commit: ");
+    mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
+  }
+  _mi_fprintf(out, arg, "\n");
+}
+
+static mi_msecs_t mi_process_start; // = 0
+
+static mi_stats_t* mi_stats_get_default(void) {
+  mi_heap_t* heap = mi_heap_get_default();
+  return &heap->tld->stats;
+}
+
+static void mi_stats_merge_from(mi_stats_t* stats) {
+  if (stats != &_mi_stats_main) {
+    mi_stats_add(&_mi_stats_main, stats);
+    memset(stats, 0, sizeof(mi_stats_t));
+  }
+}
+
+void mi_stats_reset(void) mi_attr_noexcept {
+  mi_stats_t* stats = mi_stats_get_default();
+  if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
+  memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
+  if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
+}
+
+void mi_stats_merge(void) mi_attr_noexcept {
+  mi_stats_merge_from( mi_stats_get_default() );
+}
+
+void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
+  mi_stats_merge_from(stats);
+}
+
+void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  mi_stats_merge_from(mi_stats_get_default());
+  _mi_stats_print(&_mi_stats_main, out, arg);
+}
+
+void mi_stats_print(void* out) mi_attr_noexcept {
+  // for compatibility there is an `out` parameter (which can be `stdout` or `stderr`)
+  mi_stats_print_out((mi_output_fun*)out, NULL);
+}
+
+void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
+  _mi_stats_print(mi_stats_get_default(), out, arg);
+}
+
+
+// ----------------------------------------------------------------
+// Basic timer for convenience; use milli-seconds to avoid doubles
+// ----------------------------------------------------------------
+#ifdef _WIN32
+#include <windows.h>
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+  static LARGE_INTEGER mfreq; // = 0
+  if (mfreq.QuadPart == 0LL) {
+    LARGE_INTEGER f;
+    QueryPerformanceFrequency(&f);
+    mfreq.QuadPart = f.QuadPart/1000LL;
+    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
+  }
+  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);
+}
+
+mi_msecs_t _mi_clock_now(void) {
+  LARGE_INTEGER t;
+  QueryPerformanceCounter(&t);
+  return mi_to_msecs(t);
+}
+#else
+#include <time.h>
+#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC)
+mi_msecs_t _mi_clock_now(void) {
+  struct timespec t;
+  #ifdef CLOCK_MONOTONIC
+  clock_gettime(CLOCK_MONOTONIC, &t);
+  #else
+  clock_gettime(CLOCK_REALTIME, &t);
+  #endif
+  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
+}
+#else
+// low resolution timer
+mi_msecs_t _mi_clock_now(void) {
+  return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000));
+}
+#endif
+#endif
+
+
+static mi_msecs_t mi_clock_diff;
+
+mi_msecs_t _mi_clock_start(void) {
+  if (mi_clock_diff == 0.0) {
+    mi_msecs_t t0 = _mi_clock_now();
+    mi_clock_diff = _mi_clock_now() - t0;
+  }
+  return _mi_clock_now();
+}
+
+mi_msecs_t _mi_clock_end(mi_msecs_t start) {
+  mi_msecs_t end = _mi_clock_now();
+  return (end - start - mi_clock_diff);
+}
+
+
+// --------------------------------------------------------
+// Basic process statistics
+// --------------------------------------------------------
+
+#if defined(_WIN32)
+#include <windows.h>
+
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
+  ULARGE_INTEGER i;
+  i.LowPart = ftime->dwLowDateTime;
+  i.HighPart = ftime->dwHighDateTime;
+  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
+  return msecs;
+}
+
+typedef struct _PROCESS_MEMORY_COUNTERS {
+  DWORD cb;
+  DWORD PageFaultCount;
+  SIZE_T PeakWorkingSetSize;
+  SIZE_T WorkingSetSize;
+  SIZE_T QuotaPeakPagedPoolUsage;
+  SIZE_T QuotaPagedPoolUsage;
+  SIZE_T QuotaPeakNonPagedPoolUsage;
+  SIZE_T QuotaNonPagedPoolUsage;
+  SIZE_T PagefileUsage;
+  SIZE_T PeakPagefileUsage;
+} PROCESS_MEMORY_COUNTERS;
+typedef PROCESS_MEMORY_COUNTERS* PPROCESS_MEMORY_COUNTERS;
+typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD);
+static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL;
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  FILETIME ct;
+  FILETIME ut;
+  FILETIME st;
+  FILETIME et;
+  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
+  *utime = filetime_msecs(&ut);
+  *stime = filetime_msecs(&st);
+
+  // load psapi on demand
+  if (pGetProcessMemoryInfo == NULL) {
+    HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll"));
+    if (hDll != NULL) {
+      pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo");
+    }
+  }
+
+  // get process info
+  PROCESS_MEMORY_COUNTERS info;
+  memset(&info, 0, sizeof(info));
+  if (pGetProcessMemoryInfo != NULL) {
+    pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  }
+  *current_rss    = (size_t)info.WorkingSetSize;
+  *peak_rss       = (size_t)info.PeakWorkingSetSize;
+  *current_commit = (size_t)info.PagefileUsage;
+  *peak_commit    = (size_t)info.PeakPagefileUsage;
+  *page_faults    = (size_t)info.PageFaultCount;
+}
+
+#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__))
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#endif
+
+#if defined(__HAIKU__)
+#include <kernel/OS.h>
+#endif
+
+static mi_msecs_t timeval_secs(const struct timeval* tv) {
+  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
+}
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  *utime = timeval_secs(&rusage.ru_utime);
+  *stime = timeval_secs(&rusage.ru_stime);
+#if !defined(__HAIKU__)
+  *page_faults = rusage.ru_majflt;
+#endif
+  // estimate commit using our stats
+  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+  *current_rss    = *current_commit;  // estimate
+#if defined(__HAIKU__)
+  // Haiku does not have (yet?) a way to
+  // get these stats per process
+  thread_info tid;
+  area_info mem;
+  ssize_t c;
+  get_thread_info(find_thread(0), &tid);
+  while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
+    *peak_rss += mem.ram_size;
+  }
+  *page_faults = 0;
+#elif defined(__APPLE__)
+  *peak_rss = rusage.ru_maxrss;         // BSD reports in bytes
+  struct mach_task_basic_info info;
+  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
+    *current_rss = (size_t)info.resident_size;
+  }
+#else
+  *peak_rss = rusage.ru_maxrss * 1024;  // Linux reports in KiB
+#endif
+}
+
+#else
+#ifndef __wasi__
+// WebAssembly instances are not processes
+#pragma message("define a way to get process info")
+#endif
+
+static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
+{
+  *elapsed = _mi_clock_end(mi_process_start);
+  *peak_commit    = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
+  *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
+  *peak_rss    = *peak_commit;
+  *current_rss = *current_commit;
+  *page_faults = 0;
+  *utime = 0;
+  *stime = 0;
+}
+#endif
+
+
+mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
+{
+  mi_msecs_t elapsed = 0;
+  mi_msecs_t utime = 0;
+  mi_msecs_t stime = 0;
+  size_t current_rss0 = 0;
+  size_t peak_rss0 = 0;
+  size_t current_commit0 = 0;
+  size_t peak_commit0 = 0;
+  size_t page_faults0 = 0;
+  mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
+  if (elapsed_msecs!=NULL)  *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
+  if (user_msecs!=NULL)     *user_msecs     = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
+  if (system_msecs!=NULL)   *system_msecs   = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
+  if (current_rss!=NULL)    *current_rss    = current_rss0;
+  if (peak_rss!=NULL)       *peak_rss       = peak_rss0;
+  if (current_commit!=NULL) *current_commit = current_commit0;
+  if (peak_commit!=NULL)    *peak_commit    = peak_commit0;
+  if (page_faults!=NULL)    *page_faults    = page_faults0;
+}
diff --git a/compat/mingw.c b/compat/mingw.c
index ec5280da16..6fd8004a5c 100644
--- a/compat/mingw.c
+++ b/compat/mingw.c
@@ -18,6 +18,8 @@
 #include "gettext.h"
 #define SECURITY_WIN32
 #include <sspi.h>
+#include "../write-or-die.h"
+#include "../repository.h"
 
 #define HCAST(type, handle) ((type)(intptr_t)handle)
 
@@ -545,6 +547,7 @@ static int is_local_named_pipe_path(const char *filename)
 
 int mingw_open (const char *filename, int oflags, ...)
 {
+	static int append_atomically = -1;
 	typedef int (*open_fn_t)(wchar_t const *wfilename, int oflags, ...);
 	va_list args;
 	unsigned mode;
@@ -561,7 +564,16 @@ int mingw_open (const char *filename, int oflags, ...)
 		return -1;
 	}
 
-	if ((oflags & O_APPEND) && !is_local_named_pipe_path(filename))
+	/*
+	 * Only set append_atomically to default value(1) when repo is initialized
+	 * and fail to get config value
+	 */
+	if (append_atomically < 0 && the_repository && the_repository->commondir &&
+		git_config_get_bool("windows.appendatomically", &append_atomically))
+		append_atomically = 1;
+
+	if (append_atomically && (oflags & O_APPEND) &&
+		!is_local_named_pipe_path(filename))
 		open_fn = mingw_open_append;
 	else
 		open_fn = _wopen;
@@ -705,13 +717,31 @@ ssize_t mingw_write(int fd, const void *buf, size_t len)
 {
 	ssize_t result = write(fd, buf, len);
 
-	if (result < 0 && errno == EINVAL && buf) {
+	if (result < 0 && (errno == EINVAL || errno == EBADF) && buf) {
 		/* check if fd is a pipe */
 		HANDLE h = (HANDLE) _get_osfhandle(fd);
 		if (GetFileType(h) == FILE_TYPE_PIPE)
 			errno = EPIPE;
-		else
+		else {
+			wchar_t path[MAX_PATH];
+			DWORD ret = GetFinalPathNameByHandleW(h, path,
+							ARRAY_SIZE(path), 0);
+			UINT drive_type = ret > 0 && ret < ARRAY_SIZE(path) ?
+				GetDriveTypeW(path) : DRIVE_UNKNOWN;
+
+			/*
+			 * The default atomic append causes such an error on
+			 * network file systems, in such a case, it should be
+			 * turned off via config.
+			 *
+			 * `drive_type` of UNC path: DRIVE_NO_ROOT_DIR
+			 */
+			if (DRIVE_NO_ROOT_DIR == drive_type || DRIVE_REMOTE == drive_type)
+				warning("invalid write operation detected; you may try:\n"
+					"\n\tgit config windows.appendAtomically false");
+
 			errno = EINVAL;
+		}
 	}
 
 	return result;
@@ -1059,11 +1089,19 @@ unsigned int sleep (unsigned int seconds)
 char *mingw_mktemp(char *template)
 {
 	wchar_t wtemplate[MAX_PATH];
+	int offset = 0;
+
 	if (xutftowcs_path(wtemplate, template) < 0)
 		return NULL;
+
+	if (is_dir_sep(template[0]) && !is_dir_sep(template[1]) &&
+	    iswalpha(wtemplate[0]) && wtemplate[1] == L':') {
+		/* We have an absolute path missing the drive prefix */
+		offset = 2;
+	}
 	if (!_wmktemp(wtemplate))
 		return NULL;
-	if (xwcstoutf(template, wtemplate, strlen(template) + 1) < 0)
+	if (xwcstoutf(template, wtemplate + offset, strlen(template) + 1) < 0)
 		return NULL;
 	return template;
 }
@@ -1125,37 +1163,114 @@ struct tm *localtime_r(const time_t *timep, struct tm *result)
 }
 #endif
 
+char *mingw_strbuf_realpath(struct strbuf *resolved, const char *path)
+{
+	wchar_t wpath[MAX_PATH];
+	HANDLE h;
+	DWORD ret;
+	int len;
+	const char *last_component = NULL;
+	char *append = NULL;
+
+	if (xutftowcs_path(wpath, path) < 0)
+		return NULL;
+
+	h = CreateFileW(wpath, 0,
+			FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+			OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+	/*
+	 * strbuf_realpath() allows the last path component to not exist. If
+	 * that is the case, now it's time to try without last component.
+	 */
+	if (h == INVALID_HANDLE_VALUE &&
+	    GetLastError() == ERROR_FILE_NOT_FOUND) {
+		/* cut last component off of `wpath` */
+		wchar_t *p = wpath + wcslen(wpath);
+
+		while (p != wpath)
+			if (*(--p) == L'/' || *p == L'\\')
+				break; /* found start of last component */
+
+		if (p != wpath && (last_component = find_last_dir_sep(path))) {
+			append = xstrdup(last_component + 1); /* skip directory separator */
+			/*
+			 * Do not strip the trailing slash at the drive root, otherwise
+			 * the path would be e.g. `C:` (which resolves to the
+			 * _current_ directory on that drive).
+			 */
+			if (p[-1] == L':')
+				p[1] = L'\0';
+			else
+				*p = L'\0';
+			h = CreateFileW(wpath, 0, FILE_SHARE_READ |
+					FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+					NULL, OPEN_EXISTING,
+					FILE_FLAG_BACKUP_SEMANTICS, NULL);
+		}
+	}
+
+	if (h == INVALID_HANDLE_VALUE) {
+realpath_failed:
+		FREE_AND_NULL(append);
+		return NULL;
+	}
+
+	ret = GetFinalPathNameByHandleW(h, wpath, ARRAY_SIZE(wpath), 0);
+	CloseHandle(h);
+	if (!ret || ret >= ARRAY_SIZE(wpath))
+		goto realpath_failed;
+
+	len = wcslen(wpath) * 3;
+	strbuf_grow(resolved, len);
+	len = xwcstoutf(resolved->buf, normalize_ntpath(wpath), len);
+	if (len < 0)
+		goto realpath_failed;
+	resolved->len = len;
+
+	if (append) {
+		/* Use forward-slash, like `normalize_ntpath()` */
+		strbuf_complete(resolved, '/');
+		strbuf_addstr(resolved, append);
+		FREE_AND_NULL(append);
+	}
+
+	return resolved->buf;
+
+}
+
 char *mingw_getcwd(char *pointer, int len)
 {
 	wchar_t cwd[MAX_PATH], wpointer[MAX_PATH];
 	DWORD ret = GetCurrentDirectoryW(ARRAY_SIZE(cwd), cwd);
+	HANDLE hnd;
 
 	if (!ret || ret >= ARRAY_SIZE(cwd)) {
 		errno = ret ? ENAMETOOLONG : err_win_to_posix(GetLastError());
 		return NULL;
 	}
-	ret = GetLongPathNameW(cwd, wpointer, ARRAY_SIZE(wpointer));
-	if (!ret && GetLastError() == ERROR_ACCESS_DENIED) {
-		HANDLE hnd = CreateFileW(cwd, 0,
-			FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
-			OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
-		if (hnd == INVALID_HANDLE_VALUE)
-			return NULL;
+	hnd = CreateFileW(cwd, 0,
+			  FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL,
+			  OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+	if (hnd != INVALID_HANDLE_VALUE) {
 		ret = GetFinalPathNameByHandleW(hnd, wpointer, ARRAY_SIZE(wpointer), 0);
 		CloseHandle(hnd);
-		if (!ret || ret >= ARRAY_SIZE(wpointer))
-			return NULL;
+		if (!ret || ret >= ARRAY_SIZE(wpointer)) {
+			ret = GetLongPathNameW(cwd, wpointer, ARRAY_SIZE(wpointer));
+			if (!ret || ret >= ARRAY_SIZE(wpointer)) {
+				errno = ret ? ENAMETOOLONG : err_win_to_posix(GetLastError());
+				return NULL;
+			}
+		}
 		if (xwcstoutf(pointer, normalize_ntpath(wpointer), len) < 0)
 			return NULL;
 		return pointer;
 	}
-	if (!ret || ret >= ARRAY_SIZE(wpointer))
-		return NULL;
-	if (GetFileAttributesW(wpointer) == INVALID_FILE_ATTRIBUTES) {
+	if (GetFileAttributesW(cwd) == INVALID_FILE_ATTRIBUTES) {
 		errno = ENOENT;
 		return NULL;
 	}
-	if (xwcstoutf(pointer, wpointer, len) < 0)
+	if (xwcstoutf(pointer, cwd, len) < 0)
 		return NULL;
 	convert_slashes(pointer);
 	return pointer;
@@ -1257,7 +1372,7 @@ static const char *quote_arg_msys2(const char *arg)
 
 static const char *parse_interpreter(const char *cmd)
 {
-	static char buf[100];
+	static char buf[MAX_PATH];
 	char *p, *opt;
 	int n, fd;
 
@@ -2029,18 +2144,150 @@ static void ensure_socket_initialization(void)
 	initialized = 1;
 }
 
+static int winsock_error_to_errno(DWORD err)
+{
+	switch (err) {
+	case WSAEINTR: return EINTR;
+	case WSAEBADF: return EBADF;
+	case WSAEACCES: return EACCES;
+	case WSAEFAULT: return EFAULT;
+	case WSAEINVAL: return EINVAL;
+	case WSAEMFILE: return EMFILE;
+	case WSAEWOULDBLOCK: return EWOULDBLOCK;
+	case WSAEINPROGRESS: return EINPROGRESS;
+	case WSAEALREADY: return EALREADY;
+	case WSAENOTSOCK: return ENOTSOCK;
+	case WSAEDESTADDRREQ: return EDESTADDRREQ;
+	case WSAEMSGSIZE: return EMSGSIZE;
+	case WSAEPROTOTYPE: return EPROTOTYPE;
+	case WSAENOPROTOOPT: return ENOPROTOOPT;
+	case WSAEPROTONOSUPPORT: return EPROTONOSUPPORT;
+	case WSAEOPNOTSUPP: return EOPNOTSUPP;
+	case WSAEAFNOSUPPORT: return EAFNOSUPPORT;
+	case WSAEADDRINUSE: return EADDRINUSE;
+	case WSAEADDRNOTAVAIL: return EADDRNOTAVAIL;
+	case WSAENETDOWN: return ENETDOWN;
+	case WSAENETUNREACH: return ENETUNREACH;
+	case WSAENETRESET: return ENETRESET;
+	case WSAECONNABORTED: return ECONNABORTED;
+	case WSAECONNRESET: return ECONNRESET;
+	case WSAENOBUFS: return ENOBUFS;
+	case WSAEISCONN: return EISCONN;
+	case WSAENOTCONN: return ENOTCONN;
+	case WSAETIMEDOUT: return ETIMEDOUT;
+	case WSAECONNREFUSED: return ECONNREFUSED;
+	case WSAELOOP: return ELOOP;
+	case WSAENAMETOOLONG: return ENAMETOOLONG;
+	case WSAEHOSTUNREACH: return EHOSTUNREACH;
+	case WSAENOTEMPTY: return ENOTEMPTY;
+	/* No errno equivalent; default to EIO */
+	case WSAESOCKTNOSUPPORT:
+	case WSAEPFNOSUPPORT:
+	case WSAESHUTDOWN:
+	case WSAETOOMANYREFS:
+	case WSAEHOSTDOWN:
+	case WSAEPROCLIM:
+	case WSAEUSERS:
+	case WSAEDQUOT:
+	case WSAESTALE:
+	case WSAEREMOTE:
+	case WSASYSNOTREADY:
+	case WSAVERNOTSUPPORTED:
+	case WSANOTINITIALISED:
+	case WSAEDISCON:
+	case WSAENOMORE:
+	case WSAECANCELLED:
+	case WSAEINVALIDPROCTABLE:
+	case WSAEINVALIDPROVIDER:
+	case WSAEPROVIDERFAILEDINIT:
+	case WSASYSCALLFAILURE:
+	case WSASERVICE_NOT_FOUND:
+	case WSATYPE_NOT_FOUND:
+	case WSA_E_NO_MORE:
+	case WSA_E_CANCELLED:
+	case WSAEREFUSED:
+	case WSAHOST_NOT_FOUND:
+	case WSATRY_AGAIN:
+	case WSANO_RECOVERY:
+	case WSANO_DATA:
+	case WSA_QOS_RECEIVERS:
+	case WSA_QOS_SENDERS:
+	case WSA_QOS_NO_SENDERS:
+	case WSA_QOS_NO_RECEIVERS:
+	case WSA_QOS_REQUEST_CONFIRMED:
+	case WSA_QOS_ADMISSION_FAILURE:
+	case WSA_QOS_POLICY_FAILURE:
+	case WSA_QOS_BAD_STYLE:
+	case WSA_QOS_BAD_OBJECT:
+	case WSA_QOS_TRAFFIC_CTRL_ERROR:
+	case WSA_QOS_GENERIC_ERROR:
+	case WSA_QOS_ESERVICETYPE:
+	case WSA_QOS_EFLOWSPEC:
+	case WSA_QOS_EPROVSPECBUF:
+	case WSA_QOS_EFILTERSTYLE:
+	case WSA_QOS_EFILTERTYPE:
+	case WSA_QOS_EFILTERCOUNT:
+	case WSA_QOS_EOBJLENGTH:
+	case WSA_QOS_EFLOWCOUNT:
+#ifndef _MSC_VER
+	case WSA_QOS_EUNKNOWNPSOBJ:
+#endif
+	case WSA_QOS_EPOLICYOBJ:
+	case WSA_QOS_EFLOWDESC:
+	case WSA_QOS_EPSFLOWSPEC:
+	case WSA_QOS_EPSFILTERSPEC:
+	case WSA_QOS_ESDMODEOBJ:
+	case WSA_QOS_ESHAPERATEOBJ:
+	case WSA_QOS_RESERVED_PETYPE:
+	default: return EIO;
+	}
+}
+
+/*
+ * On Windows, `errno` is a global macro to a function call.
+ * This makes it difficult to debug and single-step our mappings.
+ */
+static inline void set_wsa_errno(void)
+{
+	DWORD wsa = WSAGetLastError();
+	int e = winsock_error_to_errno(wsa);
+	errno = e;
+
+#ifdef DEBUG_WSA_ERRNO
+	fprintf(stderr, "winsock error: %d -> %d\n", wsa, e);
+	fflush(stderr);
+#endif
+}
+
+static inline int winsock_return(int ret)
+{
+	if (ret < 0)
+		set_wsa_errno();
+
+	return ret;
+}
+
+#define WINSOCK_RETURN(x) do { return winsock_return(x); } while (0)
+
 #undef gethostname
 int mingw_gethostname(char *name, int namelen)
 {
-    ensure_socket_initialization();
-    return gethostname(name, namelen);
+	ensure_socket_initialization();
+	WINSOCK_RETURN(gethostname(name, namelen));
 }
 
 #undef gethostbyname
 struct hostent *mingw_gethostbyname(const char *host)
 {
+	struct hostent *ret;
+
 	ensure_socket_initialization();
-	return gethostbyname(host);
+
+	ret = gethostbyname(host);
+	if (!ret)
+		set_wsa_errno();
+
+	return ret;
 }
 
 #undef getaddrinfo
@@ -2048,7 +2295,7 @@ int mingw_getaddrinfo(const char *node, const char *service,
 		      const struct addrinfo *hints, struct addrinfo **res)
 {
 	ensure_socket_initialization();
-	return getaddrinfo(node, service, hints, res);
+	WINSOCK_RETURN(getaddrinfo(node, service, hints, res));
 }
 
 int mingw_socket(int domain, int type, int protocol)
@@ -2068,7 +2315,7 @@ int mingw_socket(int domain, int type, int protocol)
 		 * in errno so that _if_ someone looks up the code somewhere,
 		 * then it is at least the number that are usually listed.
 		 */
-		errno = WSAGetLastError();
+		set_wsa_errno();
 		return -1;
 	}
 	/* convert into a file descriptor */
@@ -2084,35 +2331,35 @@ int mingw_socket(int domain, int type, int protocol)
 int mingw_connect(int sockfd, struct sockaddr *sa, size_t sz)
 {
 	SOCKET s = (SOCKET)_get_osfhandle(sockfd);
-	return connect(s, sa, sz);
+	WINSOCK_RETURN(connect(s, sa, sz));
 }
 
 #undef bind
 int mingw_bind(int sockfd, struct sockaddr *sa, size_t sz)
 {
 	SOCKET s = (SOCKET)_get_osfhandle(sockfd);
-	return bind(s, sa, sz);
+	WINSOCK_RETURN(bind(s, sa, sz));
 }
 
 #undef setsockopt
 int mingw_setsockopt(int sockfd, int lvl, int optname, void *optval, int optlen)
 {
 	SOCKET s = (SOCKET)_get_osfhandle(sockfd);
-	return setsockopt(s, lvl, optname, (const char*)optval, optlen);
+	WINSOCK_RETURN(setsockopt(s, lvl, optname, (const char*)optval, optlen));
 }
 
 #undef shutdown
 int mingw_shutdown(int sockfd, int how)
 {
 	SOCKET s = (SOCKET)_get_osfhandle(sockfd);
-	return shutdown(s, how);
+	WINSOCK_RETURN(shutdown(s, how));
 }
 
 #undef listen
 int mingw_listen(int sockfd, int backlog)
 {
 	SOCKET s = (SOCKET)_get_osfhandle(sockfd);
-	return listen(s, backlog);
+	WINSOCK_RETURN(listen(s, backlog));
 }
 
 #undef accept
@@ -2123,6 +2370,11 @@ int mingw_accept(int sockfd1, struct sockaddr *sa, socklen_t *sz)
 	SOCKET s1 = (SOCKET)_get_osfhandle(sockfd1);
 	SOCKET s2 = accept(s1, sa, sz);
 
+	if (s2 == INVALID_SOCKET) {
+		set_wsa_errno();
+		return -1;
+	}
+
 	/* convert into a file descriptor */
 	if ((sockfd2 = _open_osfhandle(s2, O_RDWR|O_BINARY)) < 0) {
 		int err = errno;
@@ -2517,6 +2769,28 @@ pid_t waitpid(pid_t pid, int *status, int options)
 	return -1;
 }
 
+int mingw_is_mount_point(struct strbuf *path)
+{
+	WIN32_FIND_DATAW findbuf = { 0 };
+	HANDLE handle;
+	wchar_t wfilename[MAX_PATH];
+	int wlen = xutftowcs_path(wfilename, path->buf);
+	if (wlen < 0)
+		die(_("could not get long path for '%s'"), path->buf);
+
+	/* remove trailing slash, if any */
+	if (wlen > 0 && wfilename[wlen - 1] == L'/')
+		wfilename[--wlen] = L'\0';
+
+	handle = FindFirstFileW(wfilename, &findbuf);
+	if (handle == INVALID_HANDLE_VALUE)
+		return 0;
+	FindClose(handle);
+
+	return (findbuf.dwFileAttributes & FILE_ATTRIBUTE_REPARSE_POINT) &&
+		(findbuf.dwReserved0 == IO_REPARSE_TAG_MOUNT_POINT);
+}
+
 int xutftowcsn(wchar_t *wcs, const char *utfs, size_t wcslen, int utflen)
 {
 	int upos = 0, wpos = 0;
@@ -2602,6 +2876,59 @@ int xwcstoutf(char *utf, const wchar_t *wcs, size_t utflen)
 	return -1;
 }
 
+#ifdef ENSURE_MSYSTEM_IS_SET
+static size_t append_system_bin_dirs(char *path, size_t size)
+{
+#if !defined(RUNTIME_PREFIX) || !defined(HAVE_WPGMPTR)
+	return 0;
+#else
+	char prefix[32768];
+	const char *slash;
+	size_t len = xwcstoutf(prefix, _wpgmptr, sizeof(prefix)), off = 0;
+
+	if (len == 0 || len >= sizeof(prefix) ||
+	    !(slash = find_last_dir_sep(prefix)))
+		return 0;
+	/* strip trailing `git.exe` */
+	len = slash - prefix;
+
+	/* strip trailing `cmd` or `mingw64\bin` or `mingw32\bin` or `bin` or `libexec\git-core` */
+	if (strip_suffix_mem(prefix, &len, "\\mingw64\\libexec\\git-core") ||
+	    strip_suffix_mem(prefix, &len, "\\mingw64\\bin"))
+		off += xsnprintf(path + off, size - off,
+				 "%.*s\\mingw64\\bin;", (int)len, prefix);
+	else if (strip_suffix_mem(prefix, &len, "\\mingw32\\libexec\\git-core") ||
+		 strip_suffix_mem(prefix, &len, "\\mingw32\\bin"))
+		off += xsnprintf(path + off, size - off,
+				 "%.*s\\mingw32\\bin;", (int)len, prefix);
+	else if (strip_suffix_mem(prefix, &len, "\\cmd") ||
+		 strip_suffix_mem(prefix, &len, "\\bin") ||
+		 strip_suffix_mem(prefix, &len, "\\libexec\\git-core"))
+		off += xsnprintf(path + off, size - off,
+				 "%.*s\\mingw%d\\bin;", (int)len, prefix,
+				 (int)(sizeof(void *) * 8));
+	else
+		return 0;
+
+	off += xsnprintf(path + off, size - off,
+			 "%.*s\\usr\\bin;", (int)len, prefix);
+	return off;
+#endif
+}
+#endif
+
+static int is_system32_path(const char *path)
+{
+	WCHAR system32[MAX_PATH], wpath[MAX_PATH];
+
+	if (xutftowcs_path(wpath, path) < 0 ||
+	    !GetSystemDirectoryW(system32, ARRAY_SIZE(system32)) ||
+	    _wcsicmp(system32, wpath))
+		return 0;
+
+	return 1;
+}
+
 static void setup_windows_environment(void)
 {
 	char *tmp = getenv("TMPDIR");
@@ -2626,9 +2953,20 @@ static void setup_windows_environment(void)
 		convert_slashes(tmp);
 	}
 
-	/* simulate TERM to enable auto-color (see color.c) */
-	if (!getenv("TERM"))
-		setenv("TERM", "cygwin", 1);
+
+	/*
+	 * Make sure TERM is set up correctly to enable auto-color
+	 * (see color.c .) Use "cygwin" for older OS releases which
+	 * works correctly with MSYS2 utilities on older consoles.
+	 */
+	if (!getenv("TERM")) {
+		if ((GetVersion() >> 16) < 15063)
+			setenv("TERM", "cygwin", 0);
+		else {
+			setenv("TERM", "xterm-256color", 0);
+			setenv("COLORTERM", "truecolor", 0);
+		}
+	}
 
 	/* calculate HOME if not set */
 	if (!getenv("HOME")) {
@@ -2642,7 +2980,8 @@ static void setup_windows_environment(void)
 			strbuf_addstr(&buf, tmp);
 			if ((tmp = getenv("HOMEPATH"))) {
 				strbuf_addstr(&buf, tmp);
-				if (is_directory(buf.buf))
+				if (!is_system32_path(buf.buf) &&
+				    is_directory(buf.buf))
 					setenv("HOME", buf.buf, 1);
 				else
 					tmp = NULL; /* use $USERPROFILE */
@@ -2653,6 +2992,37 @@ static void setup_windows_environment(void)
 		if (!tmp && (tmp = getenv("USERPROFILE")))
 			setenv("HOME", tmp, 1);
 	}
+
+	if (!getenv("PLINK_PROTOCOL"))
+		setenv("PLINK_PROTOCOL", "ssh", 0);
+
+#ifdef ENSURE_MSYSTEM_IS_SET
+	if (!(tmp = getenv("MSYSTEM")) || !tmp[0]) {
+		const char *home = getenv("HOME"), *path = getenv("PATH");
+		char buf[32768];
+		size_t off = 0;
+
+		xsnprintf(buf, sizeof(buf),
+			  "MINGW%d", (int)(sizeof(void *) * 8));
+		setenv("MSYSTEM", buf, 1);
+
+		if (home)
+			off += xsnprintf(buf + off, sizeof(buf) - off,
+					 "%s\\bin;", home);
+		off += append_system_bin_dirs(buf + off, sizeof(buf) - off);
+		if (path)
+			off += xsnprintf(buf + off, sizeof(buf) - off,
+					 "%s", path);
+		else if (off > 0)
+			buf[off - 1] = '\0';
+		else
+			buf[0] = '\0';
+		setenv("PATH", buf, 1);
+	}
+#endif
+
+	if (!getenv("LC_ALL") && !getenv("LC_CTYPE") && !getenv("LANG"))
+		setenv("LC_CTYPE", "C.UTF-8", 1);
 }
 
 static PSID get_current_user_sid(void)
@@ -2731,9 +3101,7 @@ int is_path_owned_by_current_sid(const char *path, struct strbuf *report)
 				    DACL_SECURITY_INFORMATION,
 				    &sid, NULL, NULL, NULL, &descriptor);
 
-	if (err != ERROR_SUCCESS)
-		error(_("failed to get owner for '%s' (%ld)"), path, err);
-	else if (sid && IsValidSid(sid)) {
+	if (err == ERROR_SUCCESS && sid && IsValidSid(sid)) {
 		/* Now, verify that the SID matches the current user's */
 		static PSID current_user_sid;
 		BOOL is_member;
@@ -3039,6 +3407,7 @@ int wmain(int argc, const wchar_t **wargv)
 #endif
 
 	maybe_redirect_std_handles();
+	fsync_object_files = 1;
 
 	/* determine size of argv and environ conversion buffer */
 	maxlen = wcslen(wargv[0]);
diff --git a/compat/mingw.h b/compat/mingw.h
index 6aec50e412..62f9d0d364 100644
--- a/compat/mingw.h
+++ b/compat/mingw.h
@@ -454,9 +454,16 @@ static inline void convert_slashes(char *path)
 		if (*path == '\\')
 			*path = '/';
 }
+struct strbuf;
+int mingw_is_mount_point(struct strbuf *path);
+#define is_mount_point mingw_is_mount_point
+#define CAN_UNLINK_MOUNT_POINTS 1
 #define PATH_SEP ';'
 char *mingw_query_user_email(void);
 #define query_user_email mingw_query_user_email
+struct strbuf;
+char *mingw_strbuf_realpath(struct strbuf *resolved, const char *path);
+#define platform_strbuf_realpath mingw_strbuf_realpath
 #if !defined(__MINGW64_VERSION_MAJOR) && (!defined(_MSC_VER) || _MSC_VER < 1800)
 #define PRIuMAX "I64u"
 #define PRId64 "I64d"
diff --git a/compat/vcbuild/README b/compat/vcbuild/README
index 29ec1d0f10..9ac9760397 100644
--- a/compat/vcbuild/README
+++ b/compat/vcbuild/README
@@ -6,7 +6,11 @@ The Steps to Build Git with VS2015 or VS2017 from the command line.
    Prompt or from an SDK bash window:
 
    $ cd <repo_root>
-   $ ./compat/vcbuild/vcpkg_install.bat
+   $ ./compat/vcbuild/vcpkg_install.bat x64-windows
+
+   or
+
+   $ ./compat/vcbuild/vcpkg_install.bat arm64-windows
 
    The vcpkg tools and all of the third-party sources will be installed
    in this folder:
@@ -37,27 +41,17 @@ The Steps to Build Git with VS2015 or VS2017 from the command line.
 
 ================================================================
 
-Alternatively, run `make vcxproj` and then load the generated `git.sln` in
-Visual Studio. The initial build will install the vcpkg system and build the
+Alternatively, just open Git's top-level directory in Visual Studio, via
+`File>Open>Folder...`. This will use CMake internally to generate the
+project definitions. It will also install the vcpkg system and build the
 dependencies automatically. This will take a while.
 
-Instead of generating the `git.sln` file yourself (which requires a full Git
-for Windows SDK), you may want to consider fetching the `vs/master` branch of
-https://github.com/git-for-windows/git instead (which is updated automatically
-via CI running `make vcxproj`). The `vs/master` branch does not require a Git
-for Windows to build, but you can run the test scripts in a regular Git Bash.
+You can also generate the Visual Studio solution manually by downloading
+and running CMake explicitly rather than letting Visual Studio doing
+that implicitly.
 
-Note that `make vcxproj` will automatically add and commit the generated `.sln`
-and `.vcxproj` files to the repo. This is necessary to allow building a
-fully-testable Git in Visual Studio, where a regular Git Bash can be used to
-run the test scripts (as opposed to a full Git for Windows SDK): a number of
-build targets, such as Git commands implemented as Unix shell scripts (where
-`@@SHELL_PATH@@` and other placeholders are interpolated) require a full-blown
-Git for Windows SDK (which is about 10x the size of a regular Git for Windows
-installation).
-
-If your plan is to open a Pull Request with Git for Windows, it is a good idea
-to drop this commit before submitting.
+Another, deprecated option is to run `make vcxproj`. This option is
+superseded by the CMake-based build, and will be removed at some point.
 
 ================================================================
 The Steps of Build Git with VS2008
diff --git a/compat/vcbuild/find_vs_env.bat b/compat/vcbuild/find_vs_env.bat
index b35d264c0e..379b16296e 100644
--- a/compat/vcbuild/find_vs_env.bat
+++ b/compat/vcbuild/find_vs_env.bat
@@ -99,6 +99,7 @@ REM ================================================================
 
    SET sdk_dir=%WindowsSdkDir%
    SET sdk_ver=%WindowsSDKVersion%
+   SET sdk_ver_bin_dir=%WindowsSdkVerBinPath%%tgt%
    SET si=%sdk_dir%Include\%sdk_ver%
    SET sdk_includes=-I"%si%ucrt" -I"%si%um" -I"%si%shared"
    SET sl=%sdk_dir%lib\%sdk_ver%
@@ -130,6 +131,7 @@ REM ================================================================
 
    SET sdk_dir=%WindowsSdkDir%
    SET sdk_ver=%WindowsSDKVersion%
+   SET sdk_ver_bin_dir=%WindowsSdkVerBinPath%bin\amd64
    SET si=%sdk_dir%Include\%sdk_ver%
    SET sdk_includes=-I"%si%ucrt" -I"%si%um" -I"%si%shared" -I"%si%winrt"
    SET sl=%sdk_dir%lib\%sdk_ver%
@@ -160,6 +162,11 @@ REM ================================================================
    echo msvc_includes=%msvc_includes%
    echo msvc_libs=%msvc_libs%
 
+   echo sdk_ver_bin_dir=%sdk_ver_bin_dir%
+   SET X1=%sdk_ver_bin_dir:C:=/C%
+   SET X2=%X1:\=/%
+   echo sdk_ver_bin_dir_msys=%X2%
+
    echo sdk_includes=%sdk_includes%
    echo sdk_libs=%sdk_libs%
 
diff --git a/compat/vcbuild/scripts/clink.pl b/compat/vcbuild/scripts/clink.pl
index 3bd824154b..677d44e46f 100755
--- a/compat/vcbuild/scripts/clink.pl
+++ b/compat/vcbuild/scripts/clink.pl
@@ -15,6 +15,7 @@ my @cflags = ();
 my @lflags = ();
 my $is_linking = 0;
 my $is_debug = 0;
+my $is_gui = 0;
 while (@ARGV) {
 	my $arg = shift @ARGV;
 	if ("$arg" eq "-DDEBUG") {
@@ -56,7 +57,8 @@ while (@ARGV) {
 		# need to use that instead?
 		foreach my $flag (@lflags) {
 			if ($flag =~ /^-LIBPATH:(.*)/) {
-				foreach my $l ("libcurl_imp.lib", "libcurl.lib") {
+				my $libcurl = $is_debug ? "libcurl-d.lib" : "libcurl.lib";
+				foreach my $l ("libcurl_imp.lib", $libcurl) {
 					if (-f "$1/$l") {
 						$lib = $l;
 						last;
@@ -66,7 +68,11 @@ while (@ARGV) {
 		}
 		push(@args, $lib);
 	} elsif ("$arg" eq "-lexpat") {
+	    if ($is_debug) {
+		push(@args, "libexpatd.lib");
+	    } else {
 		push(@args, "libexpat.lib");
+	    }
 	} elsif ("$arg" =~ /^-L/ && "$arg" ne "-LTCG") {
 		$arg =~ s/^-L/-LIBPATH:/;
 		push(@lflags, $arg);
@@ -118,11 +124,23 @@ while (@ARGV) {
 		push(@cflags, "-wd4996");
 	} elsif ("$arg" =~ /^-W[a-z]/) {
 		# let's ignore those
+	} elsif ("$arg" eq "-fno-stack-protector") {
+		# eat this
+	} elsif ("$arg" eq "-mwindows") {
+		$is_gui = 1;
 	} else {
 		push(@args, $arg);
 	}
 }
 if ($is_linking) {
+	if ($is_gui) {
+		push(@args, "-ENTRY:wWinMainCRTStartup");
+		push(@args, "-SUBSYSTEM:WINDOWS");
+	} else {
+		push(@args, "-ENTRY:wmainCRTStartup");
+		push(@args, "-SUBSYSTEM:CONSOLE");
+	}
+
 	push(@args, @lflags);
 	unshift(@args, "link.exe");
 } else {
diff --git a/compat/vcbuild/scripts/rc.pl b/compat/vcbuild/scripts/rc.pl
new file mode 100644
index 0000000000..7bca4cd81c
--- /dev/null
+++ b/compat/vcbuild/scripts/rc.pl
@@ -0,0 +1,46 @@
+#!/usr/bin/perl -w
+######################################################################
+# Compile Resources on Windows
+#
+# This is a wrapper to facilitate the compilation of Git with MSVC
+# using GNU Make as the build system. So, instead of manipulating the
+# Makefile into something nasty, just to support non-space arguments
+# etc, we use this wrapper to fix the command line options
+#
+######################################################################
+use strict;
+my @args = ();
+my @input = ();
+
+while (@ARGV) {
+	my $arg = shift @ARGV;
+	if ("$arg" =~ /^-[dD]/) {
+		# GIT_VERSION gets passed with too many
+		# layers of dquote escaping.
+		$arg =~ s/\\"/"/g;
+
+		push(@args, $arg);
+
+	} elsif ("$arg" eq "-i") {
+		my $arg = shift @ARGV;
+		# TODO complain if NULL or is dashed ??
+		push(@input, $arg);
+
+	} elsif ("$arg" eq "-o") {
+		my $arg = shift @ARGV;
+		# TODO complain if NULL or is dashed ??
+		push(@args, "-fo$arg");
+
+	} else {
+		push(@args, $arg);
+	}
+}
+
+push(@args, "-nologo");
+push(@args, "-v");
+push(@args, @input);
+
+unshift(@args, "rc.exe");
+printf("**** @args\n");
+
+exit (system(@args) != 0);
diff --git a/compat/vcbuild/vcpkg_copy_dlls.bat b/compat/vcbuild/vcpkg_copy_dlls.bat
index 13661c14f8..8bea0cbf83 100644
--- a/compat/vcbuild/vcpkg_copy_dlls.bat
+++ b/compat/vcbuild/vcpkg_copy_dlls.bat
@@ -15,7 +15,12 @@ REM ================================================================
 	@FOR /F "delims=" %%D IN ("%~dp0") DO @SET cwd=%%~fD
 	cd %cwd%
 
-	SET arch=x64-windows
+	SET arch=%2
+	IF NOT DEFINED arch (
+		echo defaulting to 'x64-windows`. Invoke %0 with 'x86-windows', 'x64-windows', or 'arm64-windows'
+		set arch=x64-windows
+	)
+
 	SET inst=%cwd%vcpkg\installed\%arch%
 
 	IF [%1]==[release] (
diff --git a/compat/vcbuild/vcpkg_install.bat b/compat/vcbuild/vcpkg_install.bat
index ebd0bad242..575c65c20b 100644
--- a/compat/vcbuild/vcpkg_install.bat
+++ b/compat/vcbuild/vcpkg_install.bat
@@ -31,11 +31,24 @@ REM ================================================================
 
 	SETLOCAL EnableDelayedExpansion
 
+	SET arch=%1
+	IF NOT DEFINED arch (
+		echo defaulting to 'x64-windows`. Invoke %0 with 'x86-windows', 'x64-windows', or 'arm64-windows'
+		set arch=x64-windows
+	)
+
 	@FOR /F "delims=" %%D IN ("%~dp0") DO @SET cwd=%%~fD
 	cd %cwd%
 
 	dir vcpkg\vcpkg.exe >nul 2>nul && GOTO :install_libraries
 
+	git.exe version 2>nul
+	IF ERRORLEVEL 1 (
+	echo "***"
+	echo "Git not found. Please adjust your CMD path or Git install option."
+	echo "***"
+	EXIT /B 1 )
+
 	echo Fetching vcpkg in %cwd%vcpkg
 	git.exe clone https://github.com/Microsoft/vcpkg vcpkg
 	IF ERRORLEVEL 1 ( EXIT /B 1 )
@@ -48,9 +61,8 @@ REM ================================================================
 	echo Successfully installed %cwd%vcpkg\vcpkg.exe
 
 :install_libraries
-	SET arch=x64-windows
 
-	echo Installing third-party libraries...
+	echo Installing third-party libraries(%arch%)...
 	FOR %%i IN (zlib expat libiconv openssl libssh2 curl) DO (
 	    cd %cwd%vcpkg
 	    IF NOT EXIST "packages\%%i_%arch%" CALL :sub__install_one %%i
@@ -73,8 +85,47 @@ REM ================================================================
 :sub__install_one
 	echo     Installing package %1...
 
-	.\vcpkg.exe install %1:%arch%
+	call :%1_features
+
+	REM vcpkg may not be reliable on slow, intermittent or proxy
+	REM connections, see e.g.
+	REM https://social.msdn.microsoft.com/Forums/windowsdesktop/en-US/4a8f7be5-5e15-4213-a7bb-ddf424a954e6/winhttpsendrequest-ends-with-12002-errorhttptimeout-after-21-seconds-no-matter-what-timeout?forum=windowssdk
+	REM which explains the hidden 21 second timeout
+	REM (last post by Dave : Microsoft - Windows Networking team)
+
+	.\vcpkg.exe install %1%features%:%arch%
 	IF ERRORLEVEL 1 ( EXIT /B 1 )
 
 	echo     Finished %1
 	goto :EOF
+
+::
+:: features for each vcpkg to install
+:: there should be an entry here for each package to install
+:: 'set features=' means use the default otherwise
+:: 'set features=[comma-delimited-feature-set]' is the syntax
+::
+
+:zlib_features
+set features=
+goto :EOF
+
+:expat_features
+set features=
+goto :EOF
+
+:libiconv_features
+set features=
+goto :EOF
+
+:openssl_features
+set features=
+goto :EOF
+
+:libssh2_features
+set features=
+goto :EOF
+
+:curl_features
+set features=[core,openssl,schannel]
+goto :EOF
diff --git a/compat/win32/pthread.c b/compat/win32/pthread.c
index 85f8f7920c..2f35b60324 100644
--- a/compat/win32/pthread.c
+++ b/compat/win32/pthread.c
@@ -21,8 +21,8 @@ static unsigned __stdcall win32_start_routine(void *arg)
 	return 0;
 }
 
-int pthread_create(pthread_t *thread, const void *unused,
-		   void *(*start_routine)(void *), void *arg)
+int win32_pthread_create(pthread_t *thread, const void *unused,
+			 void *(*start_routine)(void *), void *arg)
 {
 	thread->arg = arg;
 	thread->start_routine = start_routine;
@@ -53,7 +53,7 @@ int win32_pthread_join(pthread_t *thread, void **value_ptr)
 	}
 }
 
-pthread_t pthread_self(void)
+pthread_t win32_pthread_self(void)
 {
 	pthread_t t = { NULL };
 	t.tid = GetCurrentThreadId();
diff --git a/compat/win32/pthread.h b/compat/win32/pthread.h
index cc3221cb2c..d0061ecf33 100644
--- a/compat/win32/pthread.h
+++ b/compat/win32/pthread.h
@@ -50,8 +50,9 @@ typedef struct {
 	DWORD tid;
 } pthread_t;
 
-int pthread_create(pthread_t *thread, const void *unused,
-		   void *(*start_routine)(void*), void *arg);
+int win32_pthread_create(pthread_t *thread, const void *unused,
+			 void *(*start_routine)(void*), void *arg);
+#define pthread_create win32_pthread_create
 
 /*
  * To avoid the need of copying a struct, we use small macro wrapper to pass
@@ -62,7 +63,8 @@ int pthread_create(pthread_t *thread, const void *unused,
 int win32_pthread_join(pthread_t *thread, void **value_ptr);
 
 #define pthread_equal(t1, t2) ((t1).tid == (t2).tid)
-pthread_t pthread_self(void);
+pthread_t win32_pthread_self(void);
+#define pthread_self win32_pthread_self
 
 static inline void NORETURN pthread_exit(void *ret)
 {
diff --git a/compat/winansi.c b/compat/winansi.c
index f83610f684..acc68185f4 100644
--- a/compat/winansi.c
+++ b/compat/winansi.c
@@ -573,6 +573,9 @@ static void detect_msys_tty(int fd)
 	if (!NT_SUCCESS(NtQueryObject(h, ObjectNameInformation,
 			buffer, sizeof(buffer) - 2, &result)))
 		return;
+	if (result < sizeof(*nameinfo) || !nameinfo->Name.Buffer ||
+		!nameinfo->Name.Length)
+		return;
 	name = nameinfo->Name.Buffer;
 	name[nameinfo->Name.Length / sizeof(*name)] = 0;
 
diff --git a/config.c b/config.c
index b330c7adb4..df6b6bcb3d 100644
--- a/config.c
+++ b/config.c
@@ -1373,8 +1373,8 @@ int git_config_color(char *dest, const char *var, const char *value)
 	return 0;
 }
 
-static int git_default_core_config(const char *var, const char *value,
-				   const struct config_context *ctx, void *cb)
+int git_default_core_config(const char *var, const char *value,
+			    const struct config_context *ctx, void *cb)
 {
 	/* This needs a better name */
 	if (!strcmp(var, "core.filemode")) {
diff --git a/config.h b/config.h
index 14f881ecfa..bfc1873b26 100644
--- a/config.h
+++ b/config.h
@@ -167,6 +167,8 @@ typedef int (*config_fn_t)(const char *, const char *,
 
 int git_default_config(const char *, const char *,
 		       const struct config_context *, void *);
+int git_default_core_config(const char *var, const char *value,
+			    const struct config_context *ctx, void *cb);
 
 /**
  * Read a specific file in git-config format.
diff --git a/config.mak.dev b/config.mak.dev
index 981304727c..6d888c146c 100644
--- a/config.mak.dev
+++ b/config.mak.dev
@@ -22,8 +22,10 @@ endif
 
 ifneq ($(uname_S),FreeBSD)
 ifneq ($(or $(filter gcc6,$(COMPILER_FEATURES)),$(filter clang7,$(COMPILER_FEATURES))),)
+ifndef USE_MIMALLOC
 DEVELOPER_CFLAGS += -std=gnu99
 endif
+endif
 else
 # FreeBSD cannot limit to C99 because its system headers unconditionally
 # rely on C11 features.
diff --git a/config.mak.uname b/config.mak.uname
index 3bb03f423a..e917de125e 100644
--- a/config.mak.uname
+++ b/config.mak.uname
@@ -426,7 +426,7 @@ ifeq ($(uname_S),Windows)
 	# link.exe next to, and required by, cl.exe, we have to prepend this
 	# onto the existing $PATH.
 	#
-	SANE_TOOL_PATH ?= $(msvc_bin_dir_msys)
+	SANE_TOOL_PATH ?= $(msvc_bin_dir_msys):$(sdk_ver_bin_dir_msys)
 	HAVE_ALLOCA_H = YesPlease
 	NO_PREAD = YesPlease
 	NEEDS_CRYPTO_WITH_SSL = YesPlease
@@ -472,6 +472,7 @@ ifeq ($(uname_S),Windows)
 	NO_POSIX_GOODIES = UnfortunatelyYes
 	NATIVE_CRLF = YesPlease
 	DEFAULT_HELP_FORMAT = html
+	SKIP_DASHED_BUILT_INS = YabbaDabbaDoo
 ifeq (/mingw64,$(subst 32,64,$(prefix)))
 	# Move system config into top-level /etc/
 	ETC_GITCONFIG = ../etc/gitconfig
@@ -481,20 +482,22 @@ endif
 	CC = compat/vcbuild/scripts/clink.pl
 	AR = compat/vcbuild/scripts/lib.pl
 	CFLAGS =
-	BASIC_CFLAGS = -nologo -I. -Icompat/vcbuild/include -DWIN32 -D_CONSOLE -DHAVE_STRING_H -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE
+	BASIC_CFLAGS = -nologo -I. -Icompat/vcbuild/include -DWIN32 -D_CONSOLE -DHAVE_STRING_H -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -MP -std:c11
 	COMPAT_OBJS = compat/msvc.o compat/winansi.o \
 		compat/win32/flush.o \
 		compat/win32/path-utils.o \
 		compat/win32/pthread.o compat/win32/syslog.o \
 		compat/win32/trace2_win32_process_info.o \
 		compat/win32/dirent.o
-	COMPAT_CFLAGS = -D__USE_MINGW_ACCESS -DDETECT_MSYS_TTY -DNOGDI -DHAVE_STRING_H -Icompat -Icompat/regex -Icompat/win32 -DSTRIP_EXTENSION=\".exe\"
-	BASIC_LDFLAGS = -IGNORE:4217 -IGNORE:4049 -NOLOGO -ENTRY:wmainCRTStartup -SUBSYSTEM:CONSOLE
+	COMPAT_CFLAGS = -D__USE_MINGW_ACCESS -DDETECT_MSYS_TTY -DENSURE_MSYSTEM_IS_SET -DNOGDI -DHAVE_STRING_H -Icompat -Icompat/regex -Icompat/win32 -DSTRIP_EXTENSION=\".exe\"
+	BASIC_LDFLAGS = -IGNORE:4217 -IGNORE:4049 -NOLOGO
 	# invalidcontinue.obj allows Git's source code to close the same file
 	# handle twice, or to access the osfhandle of an already-closed stdout
 	# See https://msdn.microsoft.com/en-us/library/ms235330.aspx
 	EXTLIBS = user32.lib advapi32.lib shell32.lib wininet.lib ws2_32.lib invalidcontinue.obj kernel32.lib ntdll.lib
+	GITLIBS += git.res
 	PTHREAD_LIBS =
+	RC = compat/vcbuild/scripts/rc.pl
 	lib =
 	BASIC_CFLAGS += $(vcpkg_inc) $(sdk_includes) $(msvc_includes)
 ifndef DEBUG
@@ -652,6 +655,7 @@ ifeq ($(uname_S),MINGW)
 	FSMONITOR_DAEMON_BACKEND = win32
 	FSMONITOR_OS_SETTINGS = win32
 
+	SKIP_DASHED_BUILT_INS = YabbaDabbaDoo
 	RUNTIME_PREFIX = YesPlease
 	HAVE_WPGMPTR = YesWeDo
 	NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
@@ -666,7 +670,8 @@ ifeq ($(uname_S),MINGW)
 	DEFAULT_HELP_FORMAT = html
 	HAVE_PLATFORM_PROCINFO = YesPlease
 	CSPRNG_METHOD = rtlgenrandom
-	BASIC_LDFLAGS += -municode
+	BASIC_LDFLAGS += -municode -Wl,--tsaware
+	LAZYLOAD_LIBCURL = YesDoThatPlease
 	COMPAT_CFLAGS += -DNOGDI -Icompat -Icompat/win32
 	COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\"
 	COMPAT_OBJS += compat/mingw.o compat/winansi.o \
@@ -699,13 +704,17 @@ ifeq ($(uname_S),MINGW)
 		prefix = /mingw64
 		HOST_CPU = x86_64
 		BASIC_LDFLAGS += -Wl,--pic-executable,-e,mainCRTStartup
+	else ifeq (CLANGARM64,$(MSYSTEM))
+		prefix = /clangarm64
+		HOST_CPU = aarch64
+		BASIC_LDFLAGS += -Wl,--pic-executable,-e,mainCRTStartup
 	else
 		COMPAT_CFLAGS += -D_USE_32BIT_TIME_T
 		BASIC_LDFLAGS += -Wl,--large-address-aware
 	endif
 	CC = gcc
 	COMPAT_CFLAGS += -D__USE_MINGW_ANSI_STDIO=0 -DDETECT_MSYS_TTY \
-		-fstack-protector-strong
+		-DENSURE_MSYSTEM_IS_SET -fstack-protector-strong
 	EXTLIBS += -lntdll
 	EXTRA_PROGRAMS += headless-git$X
 	INSTALL = /bin/install
@@ -713,7 +722,8 @@ ifeq ($(uname_S),MINGW)
 	HAVE_LIBCHARSET_H = YesPlease
 	USE_GETTEXT_SCHEME = fallthrough
 	USE_LIBPCRE = YesPlease
-	USE_NED_ALLOCATOR = YesPlease
+	USE_MIMALLOC = YesPlease
+	NO_PYTHON =
 	ifeq (/mingw64,$(subst 32,64,$(prefix)))
 		# Move system config into top-level /etc/
 		ETC_GITCONFIG = ../etc/gitconfig
@@ -743,7 +753,7 @@ vcxproj:
 
 	# Make .vcxproj files and add them
 	perl contrib/buildsystems/generate -g Vcxproj
-	git add -f git.sln {*,*/lib,t/helper/*}/*.vcxproj
+	git add -f git.sln {*,*/lib.proj,t/helper/*,reftable/libreftable{,_test}.proj}/*.vcxproj
 
 	# Generate the LinkOrCopyBuiltins.targets and LinkOrCopyRemoteHttp.targets file
 	(echo '<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">' && \
@@ -753,7 +763,7 @@ vcxproj:
 	   echo '    <Copy SourceFiles="$$(OutDir)\git.exe" DestinationFiles="$$(OutDir)\'"$$name"'" SkipUnchangedFiles="true" UseHardlinksIfPossible="true" />'; \
 	 done && \
 	 echo '  </Target>' && \
-	 echo '</Project>') >git/LinkOrCopyBuiltins.targets
+	 echo '</Project>') >git.proj/LinkOrCopyBuiltins.targets
 	(echo '<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">' && \
 	 echo '  <Target Name="CopyBuiltins_AfterBuild" AfterTargets="AfterBuild">' && \
 	 for name in $(REMOTE_CURL_ALIASES); \
@@ -761,8 +771,8 @@ vcxproj:
 	   echo '    <Copy SourceFiles="$$(OutDir)\'"$(REMOTE_CURL_PRIMARY)"'" DestinationFiles="$$(OutDir)\'"$$name"'" SkipUnchangedFiles="true" UseHardlinksIfPossible="true" />'; \
 	 done && \
 	 echo '  </Target>' && \
-	 echo '</Project>') >git-remote-http/LinkOrCopyRemoteHttp.targets
-	git add -f git/LinkOrCopyBuiltins.targets git-remote-http/LinkOrCopyRemoteHttp.targets
+	 echo '</Project>') >git-remote-http.proj/LinkOrCopyRemoteHttp.targets
+	git add -f git.proj/LinkOrCopyBuiltins.targets git-remote-http.proj/LinkOrCopyRemoteHttp.targets
 
 	# Add generated headers
 	$(MAKE) MSVC=1 SKIP_VCPKG=1 prefix=/mingw64 $(GENERATED_H)
@@ -775,9 +785,11 @@ vcxproj:
 	sed -i '/^git_broken_path_fix ".*/d' git-sh-setup
 	git add -f $(SCRIPT_LIB) $(SCRIPTS)
 
+ifndef NO_PERL
 	# Add Perl module
 	$(MAKE) $(LIB_PERL_GEN)
 	git add -f perl/build
+endif
 
 	# Add bin-wrappers, for testing
 	rm -rf bin-wrappers/
diff --git a/contrib/buildsystems/CMakeLists.txt b/contrib/buildsystems/CMakeLists.txt
index 6b819e2fbd..4f25d3fa4d 100644
--- a/contrib/buildsystems/CMakeLists.txt
+++ b/contrib/buildsystems/CMakeLists.txt
@@ -14,6 +14,11 @@ Note: Visual Studio also has the option of opening `CMakeLists.txt`
 directly; Using this option, Visual Studio will not find the source code,
 though, therefore the `File>Open>Folder...` option is preferred.
 
+Visual Studio does not produce a .sln solution file nor the .vcxproj files
+that may be required by VS extension tools.
+
+To generate the .sln/.vcxproj files run CMake manually, as described below.
+
 Instructions to run CMake manually:
 
     mkdir -p contrib/buildsystems/out
@@ -22,7 +27,7 @@ Instructions to run CMake manually:
 
 This will build the git binaries in contrib/buildsystems/out
 directory (our top-level .gitignore file knows to ignore contents of
-this directory).
+this directory). The project .sln and .vcxproj files are also generated.
 
 Possible build configurations(-DCMAKE_BUILD_TYPE) with corresponding
 compiler flags
@@ -35,17 +40,16 @@ empty(default) :
 NOTE: -DCMAKE_BUILD_TYPE is optional. For multi-config generators like Visual Studio
 this option is ignored
 
-This process generates a Makefile(Linux/*BSD/MacOS) , Visual Studio solution(Windows) by default.
+This process generates a Makefile(Linux/*BSD/MacOS), Visual Studio solution(Windows) by default.
 Run `make` to build Git on Linux/*BSD/MacOS.
 Open git.sln on Windows and build Git.
 
-NOTE: By default CMake uses Makefile as the build tool on Linux and Visual Studio in Windows,
-to use another tool say `ninja` add this to the command line when configuring.
-`-G Ninja`
-
 NOTE: By default CMake will install vcpkg locally to your source tree on configuration,
 to avoid this, add `-DNO_VCPKG=TRUE` to the command line when configuring.
 
+The Visual Studio default generator changed in v16.6 from its Visual Studio
+implemenation to `Ninja` This required changes to many CMake scripts.
+
 ]]
 cmake_minimum_required(VERSION 3.14)
 
@@ -59,15 +63,29 @@ endif()
 
 if(NOT DEFINED CMAKE_EXPORT_COMPILE_COMMANDS)
 	set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE)
+	message("settting CMAKE_EXPORT_COMPILE_COMMANDS: ${CMAKE_EXPORT_COMPILE_COMMANDS}")
 endif()
 
 if(USE_VCPKG)
 	set(VCPKG_DIR "${CMAKE_SOURCE_DIR}/compat/vcbuild/vcpkg")
+	message("WIN32: ${WIN32}") # show its underlying text values
+	message("VCPKG_DIR: ${VCPKG_DIR}")
+	message("VCPKG_ARCH: ${VCPKG_ARCH}") # maybe unset
+	message("MSVC: ${MSVC}")
+	message("CMAKE_GENERATOR: ${CMAKE_GENERATOR}")
+	message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+	message("CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
+	message("CMAKE_EXPORT_COMPILE_COMMANDS: ${CMAKE_EXPORT_COMPILE_COMMANDS}")
+	message("ENV(CMAKE_EXPORT_COMPILE_COMMANDS): $ENV{CMAKE_EXPORT_COMPILE_COMMANDS}")
 	if(NOT EXISTS ${VCPKG_DIR})
 		message("Initializing vcpkg and building the Git's dependencies (this will take a while...)")
-		execute_process(COMMAND ${CMAKE_SOURCE_DIR}/compat/vcbuild/vcpkg_install.bat)
+		execute_process(COMMAND ${CMAKE_SOURCE_DIR}/compat/vcbuild/vcpkg_install.bat ${VCPKG_ARCH})
 	endif()
-	list(APPEND CMAKE_PREFIX_PATH "${VCPKG_DIR}/installed/x64-windows")
+	if(NOT EXISTS ${VCPKG_ARCH})
+		message("VCPKG_ARCH: unset, using 'x64-windows'")
+		set(VCPKG_ARCH "x64-windows") # default from vcpkg_install.bat
+	endif()
+	list(APPEND CMAKE_PREFIX_PATH "${VCPKG_DIR}/installed/${VCPKG_ARCH}")
 
 	# In the vcpkg edition, we need this to be able to link to libcurl
 	set(CURL_NO_CURL_CMAKE ON)
@@ -223,7 +241,14 @@ endif()
 
 #default behaviour
 include_directories(${CMAKE_SOURCE_DIR})
-add_compile_definitions(GIT_HOST_CPU="${CMAKE_SYSTEM_PROCESSOR}")
+
+# When cross-compiling, define HOST_CPU as the canonical name of the CPU on
+# which the built Git will run (for instance "x86_64").
+if(NOT HOST_CPU)
+	add_compile_definitions(GIT_HOST_CPU="${CMAKE_SYSTEM_PROCESSOR}")
+else()
+	add_compile_definitions(GIT_HOST_CPU="${HOST_CPU}")
+endif()
 add_compile_definitions(SHA256_BLK INTERNAL_QSORT RUNTIME_PREFIX)
 add_compile_definitions(NO_OPENSSL SHA1_DC SHA1DC_NO_STANDARD_INCLUDES
 			SHA1DC_INIT_SAFE_HASH_DEFAULT=0
@@ -740,6 +765,7 @@ if(WIN32)
 	endif()
 
 	add_executable(headless-git ${CMAKE_SOURCE_DIR}/compat/win32/headless.c)
+	list(APPEND PROGRAMS_BUILT headless-git)
 	if(CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
 		target_link_options(headless-git PUBLIC -municode -Wl,-subsystem,windows)
 	elseif(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
@@ -919,7 +945,7 @@ list(TRANSFORM git_perl_scripts PREPEND "${CMAKE_BINARY_DIR}/")
 
 #install
 foreach(program ${PROGRAMS_BUILT})
-if(program MATCHES "^(git|git-shell|scalar)$")
+if(program MATCHES "^(git|git-shell|headless-git|scalar)$")
 install(TARGETS ${program}
 	RUNTIME DESTINATION bin)
 else()
@@ -1075,7 +1101,7 @@ file(APPEND ${CMAKE_BINARY_DIR}/GIT-BUILD-OPTIONS "RUNTIME_PREFIX='${RUNTIME_PRE
 file(APPEND ${CMAKE_BINARY_DIR}/GIT-BUILD-OPTIONS "NO_PYTHON='${NO_PYTHON}'\n")
 file(APPEND ${CMAKE_BINARY_DIR}/GIT-BUILD-OPTIONS "SUPPORTS_SIMPLE_IPC='${SUPPORTS_SIMPLE_IPC}'\n")
 if(USE_VCPKG)
-	file(APPEND ${CMAKE_BINARY_DIR}/GIT-BUILD-OPTIONS "PATH=\"$PATH:$TEST_DIRECTORY/../compat/vcbuild/vcpkg/installed/x64-windows/bin\"\n")
+	file(APPEND ${CMAKE_BINARY_DIR}/GIT-BUILD-OPTIONS "PATH=\"$PATH:$TEST_DIRECTORY/../compat/vcbuild/vcpkg/installed/${VCPKG_ARCH}/bin\"\n")
 endif()
 
 #Make the tests work when building out of the source tree
diff --git a/contrib/buildsystems/Generators/Vcxproj.pm b/contrib/buildsystems/Generators/Vcxproj.pm
index b2e68a1671..52355932d2 100644
--- a/contrib/buildsystems/Generators/Vcxproj.pm
+++ b/contrib/buildsystems/Generators/Vcxproj.pm
@@ -58,8 +58,8 @@ sub createProject {
     my $uuid = generate_guid($name);
     $$build_structure{"$prefix${target}_GUID"} = $uuid;
     my $vcxproj = $target;
-    $vcxproj =~ s/(.*\/)?(.*)/$&\/$2.vcxproj/;
-    $vcxproj =~ s/([^\/]*)(\/lib)\/(lib.vcxproj)/$1$2\/$1_$3/;
+    $vcxproj =~ s/(.*\/)?(.*)/$&.proj\/$2.vcxproj/;
+    $vcxproj =~ s/([^\/]*)(\/lib\.proj)\/(lib.vcxproj)/$1$2\/$1_$3/;
     $$build_structure{"$prefix${target}_VCXPROJ"} = $vcxproj;
 
     my @srcs = sort(map("$rel_dir\\$_", @{$$build_structure{"$prefix${name}_SOURCES"}}));
@@ -77,7 +77,7 @@ sub createProject {
     my $libs_release = "\n    ";
     my $libs_debug = "\n    ";
     if (!$static_library && $name ne 'headless-git') {
-      $libs_release = join(";", sort(grep /^(?!libgit\.lib|xdiff\/lib\.lib|vcs-svn\/lib\.lib|reftable\/libreftable\.lib)/, @{$$build_structure{"$prefix${name}_LIBS"}}));
+      $libs_release = join(";", sort(grep /^(?!libgit\.lib|xdiff\/lib\.lib|vcs-svn\/lib\.lib|reftable\/libreftable(_test)?\.lib)/, @{$$build_structure{"$prefix${name}_LIBS"}}));
       $libs_debug = $libs_release;
       $libs_debug =~ s/zlib\.lib/zlibd\.lib/g;
       $libs_debug =~ s/libexpat\.lib/libexpatd\.lib/g;
@@ -88,8 +88,21 @@ sub createProject {
     $defines =~ s/</&lt;/g;
     $defines =~ s/>/&gt;/g;
     $defines =~ s/\'//g;
+    $defines =~ s/\\"/"/g;
 
-    die "Could not create the directory $target for $label project!\n" unless (-d "$target" || mkdir "$target");
+    my $rcdefines = $defines;
+    $rcdefines =~ s/(?<!\\)"/\\$&/g;
+
+    my $entrypoint = 'wmainCRTStartup';
+    my $subsystem = 'Console';
+    if (grep /^-mwindows$/, @{$$build_structure{"$prefix${name}_LFLAGS"}}) {
+        $entrypoint = 'wWinMainCRTStartup';
+        $subsystem = 'Windows';
+    }
+
+    my $dir = $vcxproj;
+    $dir =~ s/\/[^\/]*$//;
+    die "Could not create the directory $dir for $label project!\n" unless (-d "$dir" || mkdir "$dir");
 
     open F, ">$vcxproj" or die "Could not open $vcxproj for writing!\n";
     binmode F, ":crlf :utf8";
@@ -114,12 +127,21 @@ sub createProject {
       <Configuration>Release</Configuration>
       <Platform>x64</Platform>
     </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|ARM64">
+      <Configuration>Debug</Configuration>
+      <Platform>ARM64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|ARM64">
+      <Configuration>Release</Configuration>
+      <Platform>ARM64</Platform>
+    </ProjectConfiguration>
   </ItemGroup>
   <PropertyGroup Label="Globals">
     <ProjectGuid>$uuid</ProjectGuid>
     <Keyword>Win32Proj</Keyword>
     <VCPKGArch Condition="'\$(Platform)'=='Win32'">x86-windows</VCPKGArch>
-    <VCPKGArch Condition="'\$(Platform)'!='Win32'">x64-windows</VCPKGArch>
+    <VCPKGArch Condition="'\$(Platform)'=='x64'">x64-windows</VCPKGArch>
+    <VCPKGArch Condition="'\$(Platform)'=='ARM64'">arm64-windows</VCPKGArch>
     <VCPKGArchDirectory>$cdup\\compat\\vcbuild\\vcpkg\\installed\\\$(VCPKGArch)</VCPKGArchDirectory>
     <VCPKGBinDirectory Condition="'\$(Configuration)'=='Debug'">\$(VCPKGArchDirectory)\\debug\\bin</VCPKGBinDirectory>
     <VCPKGLibDirectory Condition="'\$(Configuration)'=='Debug'">\$(VCPKGArchDirectory)\\debug\\lib</VCPKGLibDirectory>
@@ -140,7 +162,7 @@ sub createProject {
   </PropertyGroup>
   <PropertyGroup>
     <ConfigurationType>$config_type</ConfigurationType>
-    <PlatformToolset>v140</PlatformToolset>
+    <PlatformToolset>v142</PlatformToolset>
     <!-- <CharacterSet>UTF-8</CharacterSet> -->
     <OutDir>..\\</OutDir>
     <!-- <IntDir>\$(ProjectDir)\$(Configuration)\\</IntDir> -->
@@ -166,6 +188,7 @@ sub createProject {
       <InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
       <PrecompiledHeader />
       <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+      <LanguageStandard_C>stdc11</LanguageStandard_C>
     </ClCompile>
     <Lib>
       <SuppressStartupBanner>true</SuppressStartupBanner>
@@ -174,9 +197,9 @@ sub createProject {
       <AdditionalLibraryDirectories>\$(VCPKGLibDirectory);%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
       <AdditionalDependencies>\$(VCPKGLibs);\$(AdditionalDependencies)</AdditionalDependencies>
       <AdditionalOptions>invalidcontinue.obj %(AdditionalOptions)</AdditionalOptions>
-      <EntryPointSymbol>wmainCRTStartup</EntryPointSymbol>
+      <EntryPointSymbol>$entrypoint</EntryPointSymbol>
       <ManifestFile>$cdup\\compat\\win32\\git.manifest</ManifestFile>
-      <SubSystem>Console</SubSystem>
+      <SubSystem>$subsystem</SubSystem>
     </Link>
 EOM
     if ($target eq 'libgit') {
@@ -184,7 +207,7 @@ EOM
     <PreBuildEvent Condition="!Exists('$cdup\\compat\\vcbuild\\vcpkg\\installed\\\$(VCPKGArch)\\include\\openssl\\ssl.h')">
       <Message>Initialize VCPKG</Message>
       <Command>del "$cdup\\compat\\vcbuild\\vcpkg"</Command>
-      <Command>call "$cdup\\compat\\vcbuild\\vcpkg_install.bat"</Command>
+      <Command>call "$cdup\\compat\\vcbuild\\vcpkg_install.bat" \$(VCPKGArch)</Command>
     </PreBuildEvent>
 EOM
     }
@@ -201,6 +224,9 @@ EOM
       <PreprocessorDefinitions>WIN32;_DEBUG;$defines;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
     </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>WIN32;_DEBUG;$rcdefines;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ResourceCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
     </Link>
@@ -214,6 +240,9 @@ EOM
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
     </ClCompile>
+    <ResourceCompile>
+      <PreprocessorDefinitions>WIN32;NDEBUG;$rcdefines;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+    </ResourceCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
       <EnableCOMDATFolding>true</EnableCOMDATFolding>
@@ -223,9 +252,15 @@ EOM
   <ItemGroup>
 EOM
     foreach(@sources) {
-        print F << "EOM";
+        if (/\.rc$/) {
+            print F << "EOM";
+    <ResourceCompile Include="$_" />
+EOM
+        } else {
+            print F << "EOM";
     <ClCompile Include="$_" />
 EOM
+        }
     }
     print F << "EOM";
   </ItemGroup>
@@ -233,26 +268,31 @@ EOM
     if ((!$static_library || $target =~ 'vcs-svn' || $target =~ 'xdiff') && !($name =~ /headless-git/)) {
       my $uuid_libgit = $$build_structure{"LIBS_libgit_GUID"};
       my $uuid_libreftable = $$build_structure{"LIBS_reftable/libreftable_GUID"};
+      my $uuid_libreftable_test = $$build_structure{"LIBS_reftable/libreftable_test_GUID"};
       my $uuid_xdiff_lib = $$build_structure{"LIBS_xdiff/lib_GUID"};
 
       print F << "EOM";
   <ItemGroup>
-    <ProjectReference Include="$cdup\\libgit\\libgit.vcxproj">
+    <ProjectReference Include="$cdup\\libgit.proj\\libgit.vcxproj">
       <Project>$uuid_libgit</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
 EOM
       if (!($name =~ /xdiff|libreftable/)) {
         print F << "EOM";
-    <ProjectReference Include="$cdup\\reftable\\libreftable\\libreftable.vcxproj">
+    <ProjectReference Include="$cdup\\reftable\\libreftable.proj\\libreftable.vcxproj">
       <Project>$uuid_libreftable</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
+    <ProjectReference Include="$cdup\\reftable\\libreftable_test.proj\\libreftable_test.vcxproj">
+      <Project>$uuid_libreftable_test</Project>
+      <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
+    </ProjectReference>
 EOM
       }
       if (!($name =~ 'xdiff')) {
         print F << "EOM";
-    <ProjectReference Include="$cdup\\xdiff\\lib\\xdiff_lib.vcxproj">
+    <ProjectReference Include="$cdup\\xdiff\\lib.proj\\xdiff_lib.vcxproj">
       <Project>$uuid_xdiff_lib</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
@@ -261,7 +301,7 @@ EOM
       if ($name =~ /(test-(line-buffer|svn-fe)|^git-remote-testsvn)\.exe$/) {
         my $uuid_vcs_svn_lib = $$build_structure{"LIBS_vcs-svn/lib_GUID"};
         print F << "EOM";
-    <ProjectReference Include="$cdup\\vcs-svn\\lib\\vcs-svn_lib.vcxproj">
+    <ProjectReference Include="$cdup\\vcs-svn\\lib.proj\\vcs-svn_lib.vcxproj">
       <Project>$uuid_vcs_svn_lib</Project>
       <ReferenceOutputAssembly>false</ReferenceOutputAssembly>
     </ProjectReference>
@@ -338,7 +378,7 @@ sub createGlueProject {
 	my $vcxproj = $build_structure{"APPS_${appname}_VCXPROJ"};
 	$vcxproj =~ s/\//\\/g;
         $appname =~ s/.*\///;
-        print F "\"${appname}\", \"${vcxproj}\", \"${uuid}\"";
+        print F "\"${appname}.proj\", \"${vcxproj}\", \"${uuid}\"";
         print F "$SLN_POST";
     }
     foreach (@libs) {
@@ -348,15 +388,17 @@ sub createGlueProject {
         my $vcxproj = $build_structure{"LIBS_${libname}_VCXPROJ"};
 	$vcxproj =~ s/\//\\/g;
         $libname =~ s/\//_/g;
-        print F "\"${libname}\", \"${vcxproj}\", \"${uuid}\"";
+        print F "\"${libname}.proj\", \"${vcxproj}\", \"${uuid}\"";
         print F "$SLN_POST";
     }
 
     print F << "EOM";
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|ARM64 = Debug|ARM64
 		Debug|x64 = Debug|x64
 		Debug|x86 = Debug|x86
+		Release|ARM64 = Release|ARM64
 		Release|x64 = Release|x64
 		Release|x86 = Release|x86
 	EndGlobalSection
@@ -367,10 +409,14 @@ EOM
     foreach (@apps) {
         my $appname = $_;
         my $uuid = $build_structure{"APPS_${appname}_GUID"};
+        print F "\t\t${uuid}.Debug|ARM64.ActiveCfg = Debug|ARM64\n";
+        print F "\t\t${uuid}.Debug|ARM64.Build.0 = Debug|ARM64\n";
         print F "\t\t${uuid}.Debug|x64.ActiveCfg = Debug|x64\n";
         print F "\t\t${uuid}.Debug|x64.Build.0 = Debug|x64\n";
         print F "\t\t${uuid}.Debug|x86.ActiveCfg = Debug|Win32\n";
         print F "\t\t${uuid}.Debug|x86.Build.0 = Debug|Win32\n";
+        print F "\t\t${uuid}.Release|ARM64.ActiveCfg = Release|ARM64\n";
+        print F "\t\t${uuid}.Release|ARM64.Build.0 = Release|ARM64\n";
         print F "\t\t${uuid}.Release|x64.ActiveCfg = Release|x64\n";
         print F "\t\t${uuid}.Release|x64.Build.0 = Release|x64\n";
         print F "\t\t${uuid}.Release|x86.ActiveCfg = Release|Win32\n";
@@ -379,10 +425,14 @@ EOM
     foreach (@libs) {
         my $libname = $_;
         my $uuid = $build_structure{"LIBS_${libname}_GUID"};
+        print F "\t\t${uuid}.Debug|ARM64.ActiveCfg = Debug|ARM64\n";
+        print F "\t\t${uuid}.Debug|ARM64.Build.0 = Debug|ARM64\n";
         print F "\t\t${uuid}.Debug|x64.ActiveCfg = Debug|x64\n";
         print F "\t\t${uuid}.Debug|x64.Build.0 = Debug|x64\n";
         print F "\t\t${uuid}.Debug|x86.ActiveCfg = Debug|Win32\n";
         print F "\t\t${uuid}.Debug|x86.Build.0 = Debug|Win32\n";
+        print F "\t\t${uuid}.Release|ARM64.ActiveCfg = Release|ARM64\n";
+        print F "\t\t${uuid}.Release|ARM64.Build.0 = Release|ARM64\n";
         print F "\t\t${uuid}.Release|x64.ActiveCfg = Release|x64\n";
         print F "\t\t${uuid}.Release|x64.Build.0 = Release|x64\n";
         print F "\t\t${uuid}.Release|x86.ActiveCfg = Release|Win32\n";
diff --git a/contrib/buildsystems/engine.pl b/contrib/buildsystems/engine.pl
index 069be7e4be..ee4fca200c 100755
--- a/contrib/buildsystems/engine.pl
+++ b/contrib/buildsystems/engine.pl
@@ -165,7 +165,7 @@ sub parseMakeOutput
             next;
         }
 
-        if($text =~ / -c /) {
+        if($text =~ / -c / || $text =~ / -i \S+\.rc /) {
             # compilation
             handleCompileLine($text, $line);
 
@@ -263,16 +263,15 @@ sub handleCompileLine
         if ("$part" eq "-o") {
             # ignore object file
             shift @parts;
-        } elsif ("$part" eq "-c") {
+        } elsif ("$part" eq "-c" || "$part" eq "-i" || "$part" =~ /^-fno-/ || "$part" eq '-pedantic') {
             # ignore compile flag
-        } elsif ("$part" eq "-c") {
         } elsif ($part =~ /^.?-I/) {
             push(@incpaths, $part);
         } elsif ($part =~ /^.?-D/) {
             push(@defines, $part);
         } elsif ($part =~ /^-/) {
             push(@cflags, $part);
-        } elsif ($part =~ /\.(c|cc|cpp)$/) {
+        } elsif ($part =~ /\.(c|cc|cpp|rc)$/) {
             $sourcefile = $part;
         } else {
             die "Unhandled compiler option @ line $lineno: $part";
@@ -359,7 +358,7 @@ sub handleLinkLine
             push(@libs, $part);
         } elsif ($part eq 'invalidcontinue.obj') {
             # ignore - known to MSVC
-        } elsif ($part =~ /\.o$/) {
+        } elsif ($part =~ /\.(o|res)$/) {
             push(@objfiles, $part);
         } elsif ($part =~ /\.obj$/) {
             # do nothing, 'make' should not be producing .obj, only .o files
@@ -373,6 +372,7 @@ sub handleLinkLine
         my $sourcefile = $_;
         $sourcefile =~ s/^headless-git\.o$/compat\/win32\/headless.c/;
         $sourcefile =~ s/\.o$/.c/;
+        $sourcefile =~ s/\.res$/.rc/;
         push(@sources, $sourcefile);
         push(@cflags, @{$compile_options{"${sourcefile}_CFLAGS"}});
         push(@defines, @{$compile_options{"${sourcefile}_DEFINES"}});
diff --git a/contrib/subtree/Makefile b/contrib/subtree/Makefile
index 6fa7496bfd..6f6e90c4cb 100644
--- a/contrib/subtree/Makefile
+++ b/contrib/subtree/Makefile
@@ -94,7 +94,7 @@ $(GIT_SUBTREE_TEST): $(GIT_SUBTREE)
 	cp $< $@
 
 test: $(GIT_SUBTREE_TEST)
-	$(MAKE) -C t/ test
+	$(MAKE) -C t/ all
 
 clean:
 	$(RM) $(GIT_SUBTREE)
diff --git a/git-compat-util.h b/git-compat-util.h
index 3e7a59b5ff..c08f7b64d3 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -191,7 +191,9 @@ struct strbuf;
 #define _ALL_SOURCE 1
 #define _GNU_SOURCE 1
 #define _BSD_SOURCE 1
+#ifndef _DEFAULT_SOURCE
 #define _DEFAULT_SOURCE 1
+#endif
 #define _NETBSD_SOURCE 1
 #define _SGI_SOURCE 1
 
@@ -404,6 +406,16 @@ char *gitdirname(char *);
 # include <sys/sysinfo.h>
 #endif
 
+#ifdef USE_MIMALLOC
+#include "mimalloc.h"
+#define malloc mi_malloc
+#define calloc mi_calloc
+#define realloc mi_realloc
+#define free mi_free
+#define strdup mi_strdup
+#define strndup mi_strndup
+#endif
+
 /* On most systems <netdb.h> would have given us this, but
  * not on some systems (e.g. z/OS).
  */
@@ -581,10 +593,18 @@ static inline int git_has_dir_sep(const char *path)
 #define has_dir_sep(path) git_has_dir_sep(path)
 #endif
 
+#ifndef is_mount_point
+#define is_mount_point is_mount_point_via_stat
+#endif
+
 #ifndef query_user_email
 #define query_user_email() NULL
 #endif
 
+#ifndef platform_strbuf_realpath
+#define platform_strbuf_realpath(resolved, path) NULL
+#endif
+
 #ifdef __TANDEM
 #include <floss.h(floss_execl,floss_execlp,floss_execv,floss_execvp)>
 #include <floss.h(floss_getpwuid)>
diff --git a/git-curl-compat.h b/git-curl-compat.h
index fd96b3cdff..ebfd7d03e9 100644
--- a/git-curl-compat.h
+++ b/git-curl-compat.h
@@ -134,4 +134,12 @@
 #define GIT_CURL_HAVE_CURLOPT_PROTOCOLS_STR 1
 #endif
 
+/**
+ * CURLSSLOPT_AUTO_CLIENT_CERT was added in 7.77.0, released in May
+ * 2021.
+ */
+#if LIBCURL_VERSION_NUM >= 0x074d00
+#define GIT_CURL_HAVE_CURLSSLOPT_AUTO_CLIENT_CERT
+#endif
+
 #endif
diff --git a/git-gui/git-gui.sh b/git-gui/git-gui.sh
index 3e5907a460..438592f5d7 100755
--- a/git-gui/git-gui.sh
+++ b/git-gui/git-gui.sh
@@ -101,6 +101,9 @@ proc _which {what args} {
 
 	if {[is_Windows] && [lsearch -exact $args -script] >= 0} {
 		set suffix {}
+	} elseif {[string match *$_search_exe $what]} {
+		# The search string already has the file extension
+		set suffix {}
 	} else {
 		set suffix $_search_exe
 	}
@@ -2079,6 +2082,7 @@ set all_icons(U$ui_index)   file_merge
 set all_icons(T$ui_index)   file_statechange
 
 set all_icons(_$ui_workdir) file_plain
+set all_icons(A$ui_workdir) file_plain
 set all_icons(M$ui_workdir) file_mod
 set all_icons(D$ui_workdir) file_question
 set all_icons(U$ui_workdir) file_merge
@@ -2105,6 +2109,7 @@ foreach i {
 		{A_ {mc "Staged for commit"}}
 		{AM {mc "Portions staged for commit"}}
 		{AD {mc "Staged for commit, missing"}}
+		{AA {mc "Intended to be added"}}
 
 		{_D {mc "Missing"}}
 		{D_ {mc "Staged for removal"}}
diff --git a/git-gui/lib/diff.tcl b/git-gui/lib/diff.tcl
index 871ad488c2..36d3715f7b 100644
--- a/git-gui/lib/diff.tcl
+++ b/git-gui/lib/diff.tcl
@@ -582,7 +582,8 @@ proc apply_or_revert_hunk {x y revert} {
 	if {$current_diff_side eq $ui_index} {
 		set failed_msg [mc "Failed to unstage selected hunk."]
 		lappend apply_cmd --reverse --cached
-		if {[string index $mi 0] ne {M}} {
+		set file_state [string index $mi 0]
+		if {$file_state ne {M} && $file_state ne {A}} {
 			unlock_index
 			return
 		}
@@ -595,7 +596,8 @@ proc apply_or_revert_hunk {x y revert} {
 			lappend apply_cmd --cached
 		}
 
-		if {[string index $mi 1] ne {M}} {
+		set file_state [string index $mi 1]
+		if {$file_state ne {M} && $file_state ne {A}} {
 			unlock_index
 			return
 		}
@@ -687,7 +689,8 @@ proc apply_or_revert_range_or_line {x y revert} {
 		set failed_msg [mc "Failed to unstage selected line."]
 		set to_context {+}
 		lappend apply_cmd --reverse --cached
-		if {[string index $mi 0] ne {M}} {
+		set file_state [string index $mi 0]
+		if {$file_state ne {M} && $file_state ne {A}} {
 			unlock_index
 			return
 		}
@@ -702,7 +705,8 @@ proc apply_or_revert_range_or_line {x y revert} {
 			lappend apply_cmd --cached
 		}
 
-		if {[string index $mi 1] ne {M}} {
+		set file_state [string index $mi 1]
+		if {$file_state ne {M} && $file_state ne {A}} {
 			unlock_index
 			return
 		}
diff --git a/git.rc b/git.rc
index cc3fdc6cc6..9c08cb214a 100644
--- a/git.rc
+++ b/git.rc
@@ -12,6 +12,7 @@ BEGIN
       VALUE "OriginalFilename", "git.exe\0"
       VALUE "ProductName", "Git\0"
       VALUE "ProductVersion", GIT_VERSION "\0"
+      VALUE "FileVersion", GIT_VERSION "\0"
     END
   END
 
diff --git a/http.c b/http.c
index 8f71bf00d8..239421a983 100644
--- a/http.c
+++ b/http.c
@@ -141,7 +141,13 @@ static char *cached_accept_language;
 
 static char *http_ssl_backend;
 
-static int http_schannel_check_revoke = 1;
+static int http_schannel_check_revoke_mode =
+#ifdef CURLSSLOPT_REVOKE_BEST_EFFORT
+	CURLSSLOPT_REVOKE_BEST_EFFORT;
+#else
+	CURLSSLOPT_NO_REVOKE;
+#endif
+
 /*
  * With the backend being set to `schannel`, setting sslCAinfo would override
  * the Certificate Store in cURL v7.60.0 and later, which is not what we want
@@ -149,6 +155,8 @@ static int http_schannel_check_revoke = 1;
  */
 static int http_schannel_use_ssl_cainfo;
 
+static int http_auto_client_cert;
+
 size_t fread_buffer(char *ptr, size_t eltsize, size_t nmemb, void *buffer_)
 {
 	size_t size = eltsize * nmemb;
@@ -405,7 +413,19 @@ static int http_options(const char *var, const char *value,
 	}
 
 	if (!strcmp("http.schannelcheckrevoke", var)) {
-		http_schannel_check_revoke = git_config_bool(var, value);
+		if (value && !strcmp(value, "best-effort")) {
+			http_schannel_check_revoke_mode =
+#ifdef CURLSSLOPT_REVOKE_BEST_EFFORT
+				CURLSSLOPT_REVOKE_BEST_EFFORT;
+#else
+				CURLSSLOPT_NO_REVOKE;
+			warning(_("%s=%s unsupported by current cURL"),
+				var, value);
+#endif
+		} else
+			http_schannel_check_revoke_mode =
+				(git_config_bool(var, value) ?
+				 0 : CURLSSLOPT_NO_REVOKE);
 		return 0;
 	}
 
@@ -414,6 +434,11 @@ static int http_options(const char *var, const char *value,
 		return 0;
 	}
 
+	if (!strcmp("http.sslautoclientcert", var)) {
+		http_auto_client_cert = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp("http.minsessions", var)) {
 		min_curl_sessions = git_config_int(var, value, ctx->kvi);
 		if (min_curl_sessions > 1)
@@ -1016,13 +1041,24 @@ static CURL *get_curl_handle(void)
 	}
 #endif
 
-	if (http_ssl_backend && !strcmp("schannel", http_ssl_backend) &&
-	    !http_schannel_check_revoke) {
+	if (http_ssl_backend && !strcmp("schannel", http_ssl_backend)) {
+		long ssl_options = 0;
+		if (http_schannel_check_revoke_mode) {
 #ifdef GIT_CURL_HAVE_CURLSSLOPT_NO_REVOKE
-		curl_easy_setopt(result, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NO_REVOKE);
+			ssl_options |= http_schannel_check_revoke_mode;
 #else
-		warning(_("CURLSSLOPT_NO_REVOKE not supported with cURL < 7.44.0"));
+			warning(_("CURLSSLOPT_NO_REVOKE not supported with cURL < 7.44.0"));
 #endif
+		}
+
+		if (http_auto_client_cert) {
+#ifdef GIT_CURL_HAVE_CURLSSLOPT_AUTO_CLIENT_CERT
+			ssl_options |= CURLSSLOPT_AUTO_CLIENT_CERT;
+#endif
+		}
+
+		if (ssl_options)
+			curl_easy_setopt(result, CURLOPT_SSL_OPTIONS, ssl_options);
 	}
 
 	if (http_proactive_auth)
diff --git a/object-file.c b/object-file.c
index 7c7afe5793..8a3b8609eb 100644
--- a/object-file.c
+++ b/object-file.c
@@ -1778,9 +1778,9 @@ void *read_object_with_reference(struct repository *r,
 }
 
 static void hash_object_body(const struct git_hash_algo *algo, git_hash_ctx *c,
-			     const void *buf, unsigned long len,
+			     const void *buf, size_t len,
 			     struct object_id *oid,
-			     char *hdr, int *hdrlen)
+			     char *hdr, size_t *hdrlen)
 {
 	algo->init_fn(c);
 	algo->update_fn(c, hdr, *hdrlen);
@@ -1789,23 +1789,23 @@ static void hash_object_body(const struct git_hash_algo *algo, git_hash_ctx *c,
 }
 
 static void write_object_file_prepare(const struct git_hash_algo *algo,
-				      const void *buf, unsigned long len,
+				      const void *buf, size_t len,
 				      enum object_type type, struct object_id *oid,
-				      char *hdr, int *hdrlen)
+				      char *hdr, size_t *hdrlen)
 {
 	git_hash_ctx c;
 
 	/* Generate the header */
 	*hdrlen = format_object_header(hdr, *hdrlen, type, len);
 
-	/* Sha1.. */
+	/* Hash (function pointers) computation */
 	hash_object_body(algo, &c, buf, len, oid, hdr, hdrlen);
 }
 
 static void write_object_file_prepare_literally(const struct git_hash_algo *algo,
-				      const void *buf, unsigned long len,
+				      const void *buf, size_t len,
 				      const char *type, struct object_id *oid,
-				      char *hdr, int *hdrlen)
+				      char *hdr, size_t *hdrlen)
 {
 	git_hash_ctx c;
 
@@ -1857,17 +1857,17 @@ out:
 }
 
 static void hash_object_file_literally(const struct git_hash_algo *algo,
-				       const void *buf, unsigned long len,
+				       const void *buf, size_t len,
 				       const char *type, struct object_id *oid)
 {
 	char hdr[MAX_HEADER_LEN];
-	int hdrlen = sizeof(hdr);
+	size_t hdrlen = sizeof(hdr);
 
 	write_object_file_prepare_literally(algo, buf, len, type, oid, hdr, &hdrlen);
 }
 
 void hash_object_file(const struct git_hash_algo *algo, const void *buf,
-		      unsigned long len, enum object_type type,
+		      size_t len, enum object_type type,
 		      struct object_id *oid)
 {
 	hash_object_file_literally(algo, buf, len, type_name(type), oid);
@@ -2209,12 +2209,12 @@ cleanup:
 	return err;
 }
 
-int write_object_file_flags(const void *buf, unsigned long len,
+int write_object_file_flags(const void *buf, size_t len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags)
 {
 	char hdr[MAX_HEADER_LEN];
-	int hdrlen = sizeof(hdr);
+	size_t hdrlen = sizeof(hdr);
 
 	/* Normally if we have it in the pack then we do not bother writing
 	 * it out into .git/objects/??/?{38} file.
@@ -2226,12 +2226,13 @@ int write_object_file_flags(const void *buf, unsigned long len,
 	return write_loose_object(oid, hdr, hdrlen, buf, len, 0, flags);
 }
 
-int write_object_file_literally(const void *buf, unsigned long len,
+int write_object_file_literally(const void *buf, size_t len,
 				const char *type, struct object_id *oid,
 				unsigned flags)
 {
 	char *header;
-	int hdrlen, status = 0;
+	size_t hdrlen;
+	int status = 0;
 
 	/* type string, SP, %lu of the length plus NUL must fit this */
 	hdrlen = strlen(type) + MAX_HEADER_LEN;
diff --git a/object-store-ll.h b/object-store-ll.h
index 26a3895c82..bb1b47f956 100644
--- a/object-store-ll.h
+++ b/object-store-ll.h
@@ -247,10 +247,10 @@ void *repo_read_object_file(struct repository *r,
 int oid_object_info(struct repository *r, const struct object_id *, unsigned long *);
 
 void hash_object_file(const struct git_hash_algo *algo, const void *buf,
-		      unsigned long len, enum object_type type,
+		      size_t len, enum object_type type,
 		      struct object_id *oid);
 
-int write_object_file_flags(const void *buf, unsigned long len,
+int write_object_file_flags(const void *buf, size_t len,
 			    enum object_type type, struct object_id *oid,
 			    unsigned flags);
 static inline int write_object_file(const void *buf, unsigned long len,
@@ -259,7 +259,7 @@ static inline int write_object_file(const void *buf, unsigned long len,
 	return write_object_file_flags(buf, len, type, oid, 0);
 }
 
-int write_object_file_literally(const void *buf, unsigned long len,
+int write_object_file_literally(const void *buf, size_t len,
 				const char *type, struct object_id *oid,
 				unsigned flags);
 int stream_loose_object(struct input_stream *in_stream, size_t len,
diff --git a/path.c b/path.c
index 67e2690efe..2645508815 100644
--- a/path.c
+++ b/path.c
@@ -1332,6 +1332,45 @@ char *strip_path_suffix(const char *path, const char *suffix)
 	return offset == -1 ? NULL : xstrndup(path, offset);
 }
 
+int is_mount_point_via_stat(struct strbuf *path)
+{
+	size_t len = path->len;
+	unsigned int current_dev;
+	struct stat st;
+
+	if (!strcmp("/", path->buf))
+		return 1;
+
+	strbuf_addstr(path, "/.");
+	if (lstat(path->buf, &st)) {
+		/*
+		 * If we cannot access the current directory, we cannot say
+		 * that it is a bind mount.
+		 */
+		strbuf_setlen(path, len);
+		return 0;
+	}
+	current_dev = st.st_dev;
+
+	/* Now look at the parent directory */
+	strbuf_addch(path, '.');
+	if (lstat(path->buf, &st)) {
+		/*
+		 * If we cannot access the parent directory, we cannot say
+		 * that it is a bind mount.
+		 */
+		strbuf_setlen(path, len);
+		return 0;
+	}
+	strbuf_setlen(path, len);
+
+	/*
+	 * If the device ID differs between current and parent directory,
+	 * then it is a bind mount.
+	 */
+	return current_dev != st.st_dev;
+}
+
 int daemon_avoid_alias(const char *p)
 {
 	int sl, ndot;
diff --git a/path.h b/path.h
index 639372edd9..db91e1213c 100644
--- a/path.h
+++ b/path.h
@@ -198,6 +198,7 @@ int normalize_path_copy(char *dst, const char *src);
 int strbuf_normalize_path(struct strbuf *src);
 int longest_ancestor_length(const char *path, struct string_list *prefixes);
 char *strip_path_suffix(const char *path, const char *suffix);
+int is_mount_point_via_stat(struct strbuf *path);
 int daemon_avoid_alias(const char *path);
 
 /*
diff --git a/prompt.c b/prompt.c
index 8935fe4dfb..cfce7e15a0 100644
--- a/prompt.c
+++ b/prompt.c
@@ -78,7 +78,7 @@ int git_read_line_interactively(struct strbuf *line)
 	int ret;
 
 	fflush(stdout);
-	ret = strbuf_getline_lf(line, stdin);
+	ret = strbuf_getline(line, stdin);
 	if (ret != EOF)
 		strbuf_trim_trailing_newline(line);
 
diff --git a/repo-settings.c b/repo-settings.c
index 525f69c0c7..9f711c8775 100644
--- a/repo-settings.c
+++ b/repo-settings.c
@@ -2,7 +2,8 @@
 #include "config.h"
 #include "repository.h"
 #include "midx.h"
-#include "compat/fsmonitor/fsm-listen.h"
+#include "fsmonitor-ipc.h"
+#include "fsmonitor-settings.h"
 
 static void repo_cfg_bool(struct repository *r, const char *key, int *dest,
 			  int def)
@@ -44,6 +45,30 @@ void prepare_repo_settings(struct repository *r)
 	if (experimental) {
 		r->settings.fetch_negotiation_algorithm = FETCH_NEGOTIATION_SKIPPING;
 		r->settings.pack_use_bitmap_boundary_traversal = 1;
+
+		/*
+		 * Force enable the builtin FSMonitor (unless the repo
+		 * is incompatible or they've already selected it or
+		 * the hook version).  But only if they haven't
+		 * explicitly turned it off -- so only if our config
+		 * value is UNSET.
+		 *
+		 * lookup_fsmonitor_settings() and check_for_ipc() do
+		 * not distinguish between explicitly set FALSE and
+		 * UNSET, so we re-test for an UNSET config key here.
+		 *
+		 * I'm not sure I want to fix fsmonitor-settings.c to
+		 * have more than one _DISABLED state since our usage
+		 * here is only to support this experimental period
+		 * (and I don't want to overload the _reason field
+		 * because it describes incompabilities).
+		 */
+		if (manyfiles &&
+		    fsmonitor_ipc__is_supported()  &&
+		    fsm_settings__get_mode(r) == FSMONITOR_MODE_DISABLED &&
+		    repo_config_get_maybe_bool(r, "core.fsmonitor", &value) > 0 &&
+		    repo_config_get_bool(r, "core.useBuiltinFSMonitor", &value))
+			fsm_settings__set_ipc(r);
 	}
 	if (manyfiles) {
 		r->settings.index_version = 4;
diff --git a/repository.c b/repository.c
index a7679ceeaa..0594a13a3f 100644
--- a/repository.c
+++ b/repository.c
@@ -53,7 +53,7 @@ static void repo_set_commondir(struct repository *repo,
 {
 	struct strbuf sb = STRBUF_INIT;
 
-	free(repo->commondir);
+	FREE_AND_NULL(repo->commondir);
 
 	if (commondir) {
 		repo->different_commondir = 1;
diff --git a/send-pack.c b/send-pack.c
index 89aca9d829..6a0deda969 100644
--- a/send-pack.c
+++ b/send-pack.c
@@ -479,7 +479,7 @@ int send_pack(struct send_pack_args *args,
 	int need_pack_data = 0;
 	int allow_deleting_refs = 0;
 	int status_report = 0;
-	int use_sideband = 0;
+	int use_sideband = 1;
 	int quiet_supported = 0;
 	int agent_supported = 0;
 	int advertise_sid = 0;
@@ -502,6 +502,7 @@ int send_pack(struct send_pack_args *args,
 		return 0;
 	}
 
+	git_config_get_bool("sendpack.sideband", &use_sideband);
 	git_config_get_bool("push.negotiate", &push_negotiate);
 	if (push_negotiate)
 		get_commons_through_negotiation(args->url, remote_refs, &commons);
@@ -520,8 +521,7 @@ int send_pack(struct send_pack_args *args,
 		allow_deleting_refs = 1;
 	if (server_supports("ofs-delta"))
 		args->use_ofs_delta = 1;
-	if (server_supports("side-band-64k"))
-		use_sideband = 1;
+	use_sideband = use_sideband && server_supports("side-band-64k");
 	if (server_supports("quiet"))
 		quiet_supported = 1;
 	if (server_supports("agent"))
diff --git a/setup.c b/setup.c
index fc592dc6dd..cc412b6307 100644
--- a/setup.c
+++ b/setup.c
@@ -1493,10 +1493,19 @@ const char *setup_git_directory_gently(int *nongit_ok)
 		break;
 	case GIT_DIR_INVALID_OWNERSHIP:
 		if (!nongit_ok) {
+			struct strbuf prequoted = STRBUF_INIT;
 			struct strbuf quoted = STRBUF_INIT;
 
 			strbuf_complete(&report, '\n');
-			sq_quote_buf_pretty(&quoted, dir.buf);
+
+#ifdef __MINGW32__
+			if (dir.buf[0] == '/')
+				strbuf_addstr(&prequoted, "%(prefix)/");
+#endif
+
+			strbuf_add(&prequoted, dir.buf, dir.len);
+			sq_quote_buf_pretty(&quoted, prequoted.buf);
+
 			die(_("detected dubious ownership in repository at '%s'\n"
 			      "%s"
 			      "To add an exception for this directory, call:\n"
@@ -2138,7 +2147,7 @@ int init_db(const char *git_dir, const char *real_git_dir,
 	startup_info->have_repository = 1;
 
 	/* Ensure `core.hidedotfiles` is processed */
-	git_config(platform_core_config, NULL);
+	git_config(git_default_core_config, NULL);
 
 	safe_create_dir(git_dir, 0);
 
diff --git a/sha1dc_git.c b/sha1dc_git.c
index 9b675a046e..fe58d7962a 100644
--- a/sha1dc_git.c
+++ b/sha1dc_git.c
@@ -27,10 +27,9 @@ void git_SHA1DCFinal(unsigned char hash[20], SHA1_CTX *ctx)
 /*
  * Same as SHA1DCUpdate, but adjust types to match git's usual interface.
  */
-void git_SHA1DCUpdate(SHA1_CTX *ctx, const void *vdata, unsigned long len)
+void git_SHA1DCUpdate(SHA1_CTX *ctx, const void *vdata, size_t len)
 {
 	const char *data = vdata;
-	/* We expect an unsigned long, but sha1dc only takes an int */
 	while (len > INT_MAX) {
 		SHA1DCUpdate(ctx, data, INT_MAX);
 		data += INT_MAX;
diff --git a/sha1dc_git.h b/sha1dc_git.h
index 60e3ce8439..159540e13a 100644
--- a/sha1dc_git.h
+++ b/sha1dc_git.h
@@ -15,7 +15,7 @@ void git_SHA1DCInit(SHA1_CTX *);
 #endif
 
 void git_SHA1DCFinal(unsigned char [20], SHA1_CTX *);
-void git_SHA1DCUpdate(SHA1_CTX *ctx, const void *data, unsigned long len);
+void git_SHA1DCUpdate(SHA1_CTX *ctx, const void *data, size_t len);
 
 #define platform_SHA_IS_SHA1DC /* used by "test-tool sha1-is-sha1dc" */
 #define platform_SHA_CTX SHA1_CTX
diff --git a/t/t0014-alias.sh b/t/t0014-alias.sh
index 95568342be..30d2b7e76e 100755
--- a/t/t0014-alias.sh
+++ b/t/t0014-alias.sh
@@ -38,10 +38,10 @@ test_expect_success 'looping aliases - internal execution' '
 #'
 
 test_expect_success 'run-command formats empty args properly' '
-    test_must_fail env GIT_TRACE=1 git frotz a "" b " " c 2>actual.raw &&
-    sed -ne "/run_command:/s/.*trace: run_command: //p" actual.raw >actual &&
-    echo "git-frotz a '\'''\'' b '\'' '\'' c" >expect &&
-    test_cmp expect actual
+	test_must_fail env GIT_TRACE=1 git frotz a "" b " " c 2>actual.raw &&
+	sed -ne "/run_command:/s/.*trace: run_command: //p" actual.raw >actual &&
+	echo "git-frotz a '\'''\'' b '\'' '\'' c" >expect &&
+	test_cmp expect actual
 '
 
 test_done
diff --git a/t/t0060-path-utils.sh b/t/t0060-path-utils.sh
index 0afa3d0d31..54123c5ab6 100755
--- a/t/t0060-path-utils.sh
+++ b/t/t0060-path-utils.sh
@@ -282,6 +282,14 @@ test_expect_success SYMLINKS 'real path works on symlinks' '
 	test_cmp expect actual
 '
 
+test_expect_success MINGW 'real path works near drive root' '
+	# we need a non-existing path at the drive root; simply skip if C:/xyz exists
+	if test ! -e C:/xyz
+	then
+		test C:/xyz = $(test-tool path-utils real_path C:/xyz)
+	fi
+'
+
 test_expect_success SYMLINKS 'prefix_path works with absolute paths to work tree symlinks' '
 	ln -s target symlink &&
 	echo "symlink" >expect &&
@@ -599,7 +607,8 @@ test_expect_success !VALGRIND,RUNTIME_PREFIX,CAN_EXEC_IN_PWD 'RUNTIME_PREFIX wor
 	cp "$GIT_EXEC_PATH"/git$X pretend/bin/ &&
 	GIT_EXEC_PATH= ./pretend/bin/git here >actual &&
 	echo HERE >expect &&
-	test_cmp expect actual'
+	test_cmp expect actual
+'
 
 test_expect_success !VALGRIND,RUNTIME_PREFIX,CAN_EXEC_IN_PWD '%(prefix)/ works' '
 	mkdir -p pretend/bin &&
@@ -610,4 +619,32 @@ test_expect_success !VALGRIND,RUNTIME_PREFIX,CAN_EXEC_IN_PWD '%(prefix)/ works'
 	test_cmp expect actual
 '
 
+test_expect_success MINGW 'MSYSTEM/PATH is adjusted if necessary' '
+	mkdir -p "$HOME"/bin pretend/mingw64/bin \
+		pretend/mingw64/libexec/git-core pretend/usr/bin &&
+	cp "$GIT_EXEC_PATH"/git.exe pretend/mingw64/bin/ &&
+	cp "$GIT_EXEC_PATH"/git.exe pretend/mingw64/libexec/git-core/ &&
+	# copy the .dll files, if any (happens when building via CMake)
+	case "$GIT_EXEC_PATH"/*.dll in
+	*/"*.dll") ;; # no `.dll` files to be copied
+	*)
+		cp "$GIT_EXEC_PATH"/*.dll pretend/mingw64/bin/ &&
+		cp "$GIT_EXEC_PATH"/*.dll pretend/mingw64/libexec/git-core/
+		;;
+	esac &&
+	echo "env | grep MSYSTEM=" | write_script "$HOME"/bin/git-test-home &&
+	echo "echo mingw64" | write_script pretend/mingw64/bin/git-test-bin &&
+	echo "echo usr" | write_script pretend/usr/bin/git-test-bin2 &&
+
+	(
+		MSYSTEM= &&
+		GIT_EXEC_PATH= &&
+		pretend/mingw64/libexec/git-core/git.exe test-home >actual &&
+		pretend/mingw64/libexec/git-core/git.exe test-bin >>actual &&
+		pretend/mingw64/bin/git.exe test-bin2 >>actual
+	) &&
+	test_write_lines MSYSTEM=$MSYSTEM mingw64 usr >expect &&
+	test_cmp expect actual
+'
+
 test_done
diff --git a/t/t1007-hash-object.sh b/t/t1007-hash-object.sh
index ac3d173767..cb7d051fb6 100755
--- a/t/t1007-hash-object.sh
+++ b/t/t1007-hash-object.sh
@@ -50,6 +50,9 @@ test_expect_success 'setup' '
 
 	example sha1:ddd3f836d3e3fbb7ae289aa9ae83536f76956399
 	example sha256:b44fe1fe65589848253737db859bd490453510719d7424daab03daf0767b85ae
+
+	large5GB sha1:0be2be10a4c8764f32c4bf372a98edc731a4b204
+	large5GB sha256:dc18ca621300c8d3cfa505a275641ebab00de189859e022a975056882d313e64
 	EOF
 '
 
@@ -260,4 +263,40 @@ test_expect_success '--literally with extra-long type' '
 	echo example | git hash-object -t $t --literally --stdin
 '
 
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'files over 4GB hash literally' '
+	test-tool genzeros $((5*1024*1024*1024)) >big &&
+	test_oid large5GB >expect &&
+	git hash-object --stdin --literally <big >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'files over 4GB hash correctly via --stdin' '
+	{ test -f big || test-tool genzeros $((5*1024*1024*1024)) >big; } &&
+	test_oid large5GB >expect &&
+	git hash-object --stdin <big >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'files over 4GB hash correctly' '
+	{ test -f big || test-tool genzeros $((5*1024*1024*1024)) >big; } &&
+	test_oid large5GB >expect &&
+	git hash-object -- big >actual &&
+	test_cmp expect actual
+'
+
+# This clean filter does nothing, other than excercising the interface.
+# We ensure that cleaning doesn't mangle large files on 64-bit Windows.
+test_expect_success EXPENSIVE,SIZE_T_IS_64BIT,!LONG_IS_64BIT \
+		'hash filtered files over 4GB correctly' '
+	{ test -f big || test-tool genzeros $((5*1024*1024*1024)) >big; } &&
+	test_oid large5GB >expect &&
+	test_config filter.null-filter.clean "cat" &&
+	echo "big filter=null-filter" >.gitattributes &&
+	git hash-object -- big >actual &&
+	test_cmp expect actual
+'
+
 test_done
diff --git a/t/t3700-add.sh b/t/t3700-add.sh
index f23d39f0d5..8aafaaf463 100755
--- a/t/t3700-add.sh
+++ b/t/t3700-add.sh
@@ -506,4 +506,15 @@ test_expect_success CASE_INSENSITIVE_FS 'path is case-insensitive' '
 	git add "$downcased"
 '
 
+test_expect_success MINGW 'can add files via NTFS junctions' '
+	test_when_finished "cmd //c rmdir junction && rm -rf target" &&
+	test_create_repo target &&
+	cmd //c "mklink /j junction target" &&
+	>target/via-junction &&
+	git -C junction add "$(pwd)/junction/via-junction" &&
+	echo via-junction >expect &&
+	git -C target diff --cached --name-only >actual &&
+	test_cmp expect actual
+'
+
 test_done
diff --git a/t/t3701-add-interactive.sh b/t/t3701-add-interactive.sh
index 0b5339ac6c..f7d91bf711 100755
--- a/t/t3701-add-interactive.sh
+++ b/t/t3701-add-interactive.sh
@@ -1085,6 +1085,27 @@ test_expect_success 'checkout -p patch editing of added file' '
 	)
 '
 
+test_expect_success EXPENSIVE 'add -i with a lot of files' '
+	git reset --hard &&
+	x160=0123456789012345678901234567890123456789 &&
+	x160=$x160$x160$x160$x160 &&
+	y= &&
+	i=0 &&
+	while test $i -le 200
+	do
+		name=$(printf "%s%03d" $x160 $i) &&
+		echo $name >$name &&
+		git add -N $name &&
+		y="${y}y$LF" &&
+		i=$(($i+1)) ||
+		exit 1
+	done &&
+	echo "$y" | git add -p -- . &&
+	git diff --cached >staged &&
+	test_line_count = 1407 staged &&
+	git reset --hard
+'
+
 test_expect_success 'show help from add--helper' '
 	git reset --hard &&
 	cat >expect <<-EOF &&
diff --git a/t/t5505-remote.sh b/t/t5505-remote.sh
index 7789ff12c4..5caef356f6 100755
--- a/t/t5505-remote.sh
+++ b/t/t5505-remote.sh
@@ -835,8 +835,8 @@ test_expect_success '"remote show" does not show symbolic refs' '
 	(
 		cd three &&
 		git remote show origin >output &&
-		! grep "^ *HEAD$" < output &&
-		! grep -i stale < output
+		! grep "^ *HEAD$" <output &&
+		! grep -i stale <output
 	)
 '
 
@@ -1039,7 +1039,7 @@ test_expect_success 'migrate a remote from named file in $GIT_DIR/branches' '
 	(
 		cd six &&
 		git remote rm origin &&
-		mkdir .git/branches &&
+		mkdir -p .git/branches &&
 		echo "$origin_url#main" >.git/branches/origin &&
 		git remote rename origin origin &&
 		test_path_is_missing .git/branches/origin &&
@@ -1054,8 +1054,8 @@ test_expect_success 'migrate a remote from named file in $GIT_DIR/branches (2)'
 	(
 		cd seven &&
 		git remote rm origin &&
-		mkdir .git/branches &&
-		echo "quux#foom" > .git/branches/origin &&
+		mkdir -p .git/branches &&
+		echo "quux#foom" >.git/branches/origin &&
 		git remote rename origin origin &&
 		test_path_is_missing .git/branches/origin &&
 		test "$(git config remote.origin.url)" = "quux" &&
diff --git a/t/t5516-fetch-push.sh b/t/t5516-fetch-push.sh
index 2e7c0e1648..16bc9c0045 100755
--- a/t/t5516-fetch-push.sh
+++ b/t/t5516-fetch-push.sh
@@ -969,8 +969,8 @@ test_expect_success 'fetch with branches' '
 	mk_empty testrepo &&
 	git branch second $the_first_commit &&
 	git checkout second &&
-	mkdir testrepo/.git/branches &&
-	echo ".." > testrepo/.git/branches/branch1 &&
+	mkdir -p testrepo/.git/branches &&
+	echo ".." >testrepo/.git/branches/branch1 &&
 	(
 		cd testrepo &&
 		git fetch branch1 &&
@@ -983,8 +983,8 @@ test_expect_success 'fetch with branches' '
 
 test_expect_success 'fetch with branches containing #' '
 	mk_empty testrepo &&
-	mkdir testrepo/.git/branches &&
-	echo "..#second" > testrepo/.git/branches/branch2 &&
+	mkdir -p testrepo/.git/branches &&
+	echo "..#second" >testrepo/.git/branches/branch2 &&
 	(
 		cd testrepo &&
 		git fetch branch2 &&
@@ -1000,8 +1000,8 @@ test_expect_success 'push with branches' '
 	git checkout second &&
 
 	test_when_finished "rm -rf .git/branches" &&
-	mkdir .git/branches &&
-	echo "testrepo" > .git/branches/branch1 &&
+	mkdir -p .git/branches &&
+	echo "testrepo" >.git/branches/branch1 &&
 
 	git push branch1 &&
 	(
@@ -1016,8 +1016,8 @@ test_expect_success 'push with branches containing #' '
 	mk_empty testrepo &&
 
 	test_when_finished "rm -rf .git/branches" &&
-	mkdir .git/branches &&
-	echo "testrepo#branch3" > .git/branches/branch2 &&
+	mkdir -p .git/branches &&
+	echo "testrepo#branch3" >.git/branches/branch2 &&
 
 	git push branch2 &&
 	(
@@ -1546,7 +1546,7 @@ EOF
 	git init no-thin &&
 	git --git-dir=no-thin/.git config receive.unpacklimit 0 &&
 	git push no-thin/.git refs/heads/main:refs/heads/foo &&
-	echo modified >> path1 &&
+	echo modified >>path1 &&
 	git commit -am modified &&
 	git repack -adf &&
 	rcvpck="git receive-pack --reject-thin-pack-for-testing" &&
diff --git a/t/t5580-unc-paths.sh b/t/t5580-unc-paths.sh
index d7537a162b..a513dede7e 100755
--- a/t/t5580-unc-paths.sh
+++ b/t/t5580-unc-paths.sh
@@ -21,14 +21,11 @@ fi
 UNCPATH="$(winpwd)"
 case "$UNCPATH" in
 [A-Z]:*)
+	WITHOUTDRIVE="${UNCPATH#?:}"
 	# Use administrative share e.g. \\localhost\C$\git-sdk-64\usr\src\git
 	# (we use forward slashes here because MSYS2 and Git accept them, and
 	# they are easier on the eyes)
-	UNCPATH="//localhost/${UNCPATH%%:*}\$/${UNCPATH#?:}"
-	test -d "$UNCPATH" || {
-		skip_all='could not access administrative share; skipping'
-		test_done
-	}
+	UNCPATH="//localhost/${UNCPATH%%:*}\$$WITHOUTDRIVE"
 	;;
 *)
 	skip_all='skipping UNC path tests, cannot determine current path as UNC'
@@ -36,6 +33,18 @@ case "$UNCPATH" in
 	;;
 esac
 
+test_expect_success 'clone into absolute path lacking a drive prefix' '
+	USINGBACKSLASHES="$(echo "$WITHOUTDRIVE"/without-drive-prefix |
+		tr / \\\\)" &&
+	git clone . "$USINGBACKSLASHES" &&
+	test -f without-drive-prefix/.git/HEAD
+'
+
+test -d "$UNCPATH" || {
+	skip_all='could not access administrative share; skipping'
+	test_done
+}
+
 test_expect_success setup '
 	test_commit initial
 '
diff --git a/t/t5601-clone.sh b/t/t5601-clone.sh
index 47eae641f0..cff9757443 100755
--- a/t/t5601-clone.sh
+++ b/t/t5601-clone.sh
@@ -71,6 +71,13 @@ test_expect_success 'clone respects GIT_WORK_TREE' '
 
 '
 
+test_expect_success CASE_INSENSITIVE_FS 'core.worktree is not added due to path case' '
+
+	mkdir UPPERCASE &&
+	git clone src "$(pwd)/uppercase" &&
+	test "unset" = "$(git -C UPPERCASE config --default unset core.worktree)"
+'
+
 test_expect_success 'clone from hooks' '
 
 	test_create_repo r0 &&
diff --git a/t/t5801-remote-helpers.sh b/t/t5801-remote-helpers.sh
index 4e0a77f985..351c05ee28 100755
--- a/t/t5801-remote-helpers.sh
+++ b/t/t5801-remote-helpers.sh
@@ -239,7 +239,7 @@ test_expect_success 'push update refs failure' '
 	echo "update fail" >>file &&
 	git commit -a -m "update fail" &&
 	git rev-parse --verify testgit/origin/heads/update >expect &&
-	test_expect_code 1 env GIT_REMOTE_TESTGIT_FAILURE="non-fast forward" \
+	test_must_fail env GIT_REMOTE_TESTGIT_FAILURE="non-fast forward" \
 		git push origin update &&
 	git rev-parse --verify testgit/origin/heads/update >actual &&
 	test_cmp expect actual
diff --git a/t/t7300-clean.sh b/t/t7300-clean.sh
index 1a310a45fd..ff0def0b7a 100755
--- a/t/t7300-clean.sh
+++ b/t/t7300-clean.sh
@@ -789,4 +789,14 @@ test_expect_success 'traverse into directories that may have ignored entries' '
 	)
 '
 
+test_expect_success MINGW 'clean does not traverse mount points' '
+	mkdir target &&
+	>target/dont-clean-me &&
+	git init with-mountpoint &&
+	cmd //c "mklink /j with-mountpoint\\mountpoint target" &&
+	git -C with-mountpoint clean -dfx &&
+	test_path_is_missing with-mountpoint/mountpoint &&
+	test_path_is_file target/dont-clean-me
+'
+
 test_done
diff --git a/t/t7502-commit-porcelain.sh b/t/t7502-commit-porcelain.sh
index 61c8e810cc..06c9e4cfe2 100755
--- a/t/t7502-commit-porcelain.sh
+++ b/t/t7502-commit-porcelain.sh
@@ -605,6 +605,48 @@ test_expect_success 'cleanup commit messages (scissors option,-F,-e, scissors on
 	test_must_be_empty actual
 '
 
+test_expect_success 'helper-editor' '
+
+	write_script lf-to-crlf.sh <<-\EOF
+	sed "s/\$/Q/" <"$1" | tr Q "\\015" >"$1".new &&
+	mv -f "$1".new "$1"
+	EOF
+'
+
+test_expect_success 'cleanup commit messages (scissors option,-F,-e, CR/LF line endings)' '
+
+	test_config core.editor "\"$PWD/lf-to-crlf.sh\"" &&
+	scissors="# ------------------------ >8 ------------------------" &&
+
+	test_write_lines >text \
+	"# Keep this comment" "" " $scissors" \
+	"# Keep this comment, too" "$scissors" \
+	"# Remove this comment" "$scissors" \
+	"Remove this comment, too" &&
+
+	test_write_lines >expect \
+	"# Keep this comment" "" " $scissors" \
+	"# Keep this comment, too" &&
+
+	git commit --cleanup=scissors -e -F text --allow-empty &&
+	git cat-file -p HEAD >raw &&
+	sed -e "1,/^\$/d" raw >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'cleanup commit messages (scissors option,-F,-e, scissors on first line, CR/LF line endings)' '
+
+	scissors="# ------------------------ >8 ------------------------" &&
+	test_write_lines >text \
+	"$scissors" \
+	"# Remove this comment and any following lines" &&
+	cp text /tmp/test2-text &&
+	git commit --cleanup=scissors -e -F text --allow-empty --allow-empty-message &&
+	git cat-file -p HEAD >raw &&
+	sed -e "1,/^\$/d" raw >actual &&
+	test_must_be_empty actual
+'
+
 test_expect_success 'cleanup commit messages (strip option,-F)' '
 
 	echo >>negative &&
diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
index 26c25c0eb2..cd4673172c 100755
--- a/t/t9350-fast-export.sh
+++ b/t/t9350-fast-export.sh
@@ -791,4 +791,15 @@ test_expect_success 'fast-export --first-parent outputs all revisions output by
 	)
 '
 
+cat > expected << EOF
+reset refs/heads/master
+from $(git rev-parse master)
+
+EOF
+
+test_expect_failure 'refs are updated even if no commits need to be exported' '
+	git fast-export master..master > actual &&
+	test_cmp expected actual
+'
+
 test_done
diff --git a/transport-helper.c b/transport-helper.c
index 49811ef176..ba367b5bee 100644
--- a/transport-helper.c
+++ b/transport-helper.c
@@ -21,6 +21,8 @@
 #include "protocol.h"
 
 static int debug;
+/* TODO: put somewhere sensible, e.g. git_transport_options? */
+static int auto_gc = 1;
 
 struct helper_data {
 	const char *name;
@@ -483,10 +485,25 @@ static int get_exporter(struct transport *transport,
 	for (i = 0; i < revlist_args->nr; i++)
 		strvec_push(&fastexport->args, revlist_args->items[i].string);
 
+	strvec_push(&fastexport->args, "--");
+
 	fastexport->git_cmd = 1;
 	return start_command(fastexport);
 }
 
+static void check_helper_status(struct helper_data *data)
+{
+	int pid, status;
+
+	pid = waitpid(data->helper->pid, &status, WNOHANG);
+	if (pid < 0)
+		die("Could not retrieve status of remote helper '%s'",
+		    data->name);
+	if (pid > 0 && WIFEXITED(status))
+		die("Remote helper '%s' died with %d",
+		    data->name, WEXITSTATUS(status));
+}
+
 static int fetch_with_import(struct transport *transport,
 			     int nr_heads, struct ref **to_fetch)
 {
@@ -523,6 +540,7 @@ static int fetch_with_import(struct transport *transport,
 
 	if (finish_command(&fastimport))
 		die(_("error while running fast-import"));
+	check_helper_status(data);
 
 	/*
 	 * The fast-import stream of a remote helper that advertises
@@ -556,6 +574,13 @@ static int fetch_with_import(struct transport *transport,
 		}
 	}
 	strbuf_release(&buf);
+	if (auto_gc) {
+		struct child_process cmd = CHILD_PROCESS_INIT;
+
+		cmd.git_cmd = 1;
+		strvec_pushl(&cmd.args, "gc", "--auto", "--quiet", NULL);
+		run_command(&cmd);
+	}
 	return 0;
 }
 
@@ -1127,6 +1152,7 @@ static int push_refs_with_export(struct transport *transport,
 
 	if (finish_command(&exporter))
 		die(_("error while running fast-export"));
+	check_helper_status(data);
 	if (push_update_refs_status(data, remote_refs, flags))
 		return 1;
 
diff --git a/wt-status.c b/wt-status.c
index 9f45bf6949..96ba3c94dc 100644
--- a/wt-status.c
+++ b/wt-status.c
@@ -36,7 +36,7 @@
 #define UF_DELAY_WARNING_IN_MS (2 * 1000)
 
 static const char cut_line[] =
-"------------------------ >8 ------------------------\n";
+"------------------------ >8 ------------------------";
 
 static char default_wt_status_colors[][COLOR_MAXLEN] = {
 	GIT_COLOR_NORMAL, /* WT_STATUS_HEADER */
@@ -1084,15 +1084,22 @@ conclude:
 	status_printf_ln(s, GIT_COLOR_NORMAL, "%s", "");
 }
 
+static inline int starts_with_newline(const char *p)
+{
+    return *p == '\n' || (*p == '\r' && p[1] == '\n');
+}
+
 size_t wt_status_locate_end(const char *s, size_t len)
 {
 	const char *p;
 	struct strbuf pattern = STRBUF_INIT;
 
 	strbuf_addf(&pattern, "\n%c %s", comment_line_char, cut_line);
-	if (starts_with(s, pattern.buf + 1))
+	if (starts_with(s, pattern.buf + 1) &&
+	    starts_with_newline(s + pattern.len - 1))
 		len = 0;
-	else if ((p = strstr(s, pattern.buf)))
+	else if ((p = strstr(s, pattern.buf)) &&
+		 starts_with_newline(p + pattern.len))
 		len = p - s + 1;
 	strbuf_release(&pattern);
 	return len;