Compare commits

...

485 commits
v0.1 ... main

Author SHA1 Message Date
Simon Cruanes
189a95a514
fix: in Lock, prevent flambda from reordering mutex-protected operations
inspired by Mutex.protect in the stdlib, which is also `[@inline never]`
for this reason
2025-11-19 12:28:09 -05:00
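A minimal sketch of the pattern this commit describes, assuming a simple `Lock` wrapper around `Mutex` (illustrative names, not Moonpool's actual code): marking the critical-section function `[@inline never]` keeps flambda from inlining it and moving reads or writes across the lock/unlock pair.

```ocaml
(* sketch: [@inline never] keeps the critical section opaque to flambda,
   so mutex-protected operations cannot be reordered around it *)
type 'a lock = { mutex : Mutex.t; mutable content : 'a }

let[@inline never] with_lock (l : 'a lock) (f : 'a -> 'b) : 'b =
  Mutex.lock l.mutex;
  match f l.content with
  | x ->
    Mutex.unlock l.mutex;
    x
  | exception e ->
    Mutex.unlock l.mutex;
    raise e

let () =
  let l = { mutex = Mutex.create (); content = 0 } in
  ignore (with_lock l (fun x -> x + 1))
```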
Simon Cruanes
0959004b11
document how many threads are used for work in Ws_pool 2025-11-19 12:24:33 -05:00
Simon Cruanes
75e528413b
remove mentions of ocaml4 in readme
2025-11-14 22:56:37 -05:00
Simon Cruanes
4de33f0121
prepare for 0.10
2025-11-13 20:27:01 -05:00
Simon Cruanes
58a0f891f7
Merge pull request #36 from c-cube/simon/fix-35
fix domain pool: block signals in background threads
2025-11-13 19:58:54 -05:00
Simon Cruanes
b1688f71e7
more signal handling 2025-11-13 19:53:02 -05:00
Simon Cruanes
794b263d36
improve lock 2025-11-13 19:50:40 -05:00
Simon Cruanes
a40ea8b41b
avoid recursion in dpool 2025-11-13 19:46:56 -05:00
Simon Cruanes
40e97d969a
fix domain pool: block signals in background threads
close #35
2025-11-13 19:46:56 -05:00
Simon Cruanes
c3f235f7e9
Merge pull request #40 from c-cube/simon/reduce-scope-round2
reduce scope again: remove structured concurrency-based fibers
2025-11-13 19:40:37 -05:00
Simon Cruanes
0b28898586
rename 2025-11-13 19:39:57 -05:00
Simon Cruanes
997d996c13
fix test 2025-11-12 09:10:52 -05:00
Simon Cruanes
ee7972910f
breaking: remove around_task from schedulers 2025-11-12 00:25:02 -05:00
Simon Cruanes
2ce3fa7d3e
docs 2025-11-12 00:25:02 -05:00
Simon Cruanes
8770d4fb9c
repro for #41 2025-11-12 00:25:02 -05:00
Simon Cruanes
95de0e7e27
test: update readme and the mdx test 2025-10-25 21:50:47 -04:00
Simon Cruanes
4924b5f52b
test: update tests, removing the fibers and cancellation tests 2025-10-25 21:50:47 -04:00
Simon Cruanes
db9cddf999
feat core: add Main, salvaged from moonpool.fib 2025-10-25 21:50:46 -04:00
Simon Cruanes
f9ab951c36
remove moonpool.fib
it's complicated and hard to use in practice, because it's not obvious
whether a piece of code is running under another fiber or not, so
`Fiber.spawn` might fail because it has no parent.

So in practice we've been using `Fiber.spawn_top`… which offers no
advantage over just using `Fut.spawn`.
2025-10-25 21:50:46 -04:00
Simon Cruanes
2aa2612963
doc for Fut 2025-10-25 21:50:46 -04:00
Simon Cruanes
f92efa562d
doc 2025-10-25 21:50:46 -04:00
Simon Cruanes
d957f7b54e
small refactor
2025-10-25 21:46:20 -04:00
Simon Cruanes
a26503df0b
refactor chan; fix bug in Chan.try_push
we could return `false` even though we succeeded in pushing a value into
the chan.
2025-10-25 21:21:03 -04:00
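A hedged sketch of the invariant this fix restores, over a hypothetical mutex-protected bounded queue (not Moonpool's actual channel): the boolean returned by `try_push` must reflect exactly whether the element was enqueued.

```ocaml
type 'a chan = { mutex : Mutex.t; q : 'a Queue.t; cap : int }

let try_push (c : 'a chan) (x : 'a) : bool =
  Mutex.lock c.mutex;
  (* compute the result under the lock: no code path may enqueue the
     element and still report [false] *)
  let pushed =
    if Queue.length c.q < c.cap then (
      Queue.push x c.q;
      true
    ) else
      false
  in
  Mutex.unlock c.mutex;
  pushed
```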
Simon Cruanes
92300ad698
fix: make Moonpool_lwt.fut_of_lwt idempotent
this way the resulting future can be cancelled/fulfilled from
the outside without crashing Lwt
2025-10-07 13:53:54 -04:00
Simon Cruanes
538f3df31a
doc correction
2025-10-02 14:37:15 -04:00
Simon Cruanes
dbc099052d
CI
2025-09-30 11:26:05 -04:00
Simon Cruanes
8d99628f03
remove deprecated moonpool-io and moonpool.sync 2025-09-30 11:24:53 -04:00
Simon Cruanes
0e5a2896ef
prepare for 0.9
2025-09-29 08:51:42 -04:00
Simon Cruanes
9601621ebc
opam fixes 2025-09-29 08:51:42 -04:00
Simon Cruanes
70018423ff
fix build 2025-09-26 15:44:21 -04:00
Simon Cruanes
64c3442078
more doc 2025-09-26 15:44:21 -04:00
Simon Cruanes
03f8ccd030
CI 2025-09-26 15:44:21 -04:00
Simon Cruanes
d98dadeb84
Merge pull request #37 from c-cube/simon/lwt-main-runner-2025-07-09
change moonpool-lwt to make it a lwt-engine based runner
2025-09-26 15:04:48 -04:00
Simon Cruanes
d79200f555
Merge pull request #39 from c-cube/simon/reduce-scope-2025-07-09
reduce scope of the library a bit
2025-09-26 15:02:01 -04:00
Simon Cruanes
2dbbad4ef2
refactor moonpool_lwt 2025-09-26 14:55:26 -04:00
Simon Cruanes
677ae5c36a
perf: fast path for Moonpool_lwt.run_in_lwt_and_await 2025-09-26 14:55:26 -04:00
Simon Cruanes
4e19719c4f
modify signature for Moonpool_lwt.run_in_lwt_and_await 2025-09-26 14:55:26 -04:00
Simon Cruanes
4f685313de
detail 2025-09-26 14:55:26 -04:00
Simon Cruanes
8bd79c70b5
add Moonpool_lwt.on_lwt_thread 2025-09-26 14:55:26 -04:00
Simon Cruanes
f245f4913c
add Moonpool_lwt.spawn_lwt_ignore 2025-09-26 14:55:26 -04:00
Simon Cruanes
2aabc30b70
fix test 2025-09-26 14:55:26 -04:00
Simon Cruanes
a42737aa81
format 2025-09-26 14:55:25 -04:00
Simon Cruanes
bf649f5348
fix test 2025-09-26 14:55:25 -04:00
Simon Cruanes
44edf60836
fix tests 2025-09-26 14:55:25 -04:00
Simon Cruanes
86b64ae3d4
fix lwt: make sure to wakeup loop in main
there's a race condition where, by the time we schedule the
main fiber in `lwt_main`, the event loop is already asleep (maybe
from a previous run). We make sure to wake the loop up.
2025-09-26 14:55:25 -04:00
Simon Cruanes
01026fafaa
doc 2025-09-26 14:55:25 -04:00
Simon Cruanes
2afb5c1036
adapt some tests for the lwt runner 2025-09-26 14:55:25 -04:00
Simon Cruanes
9e814ecb48
lwt: handle fibers in moonpool_lwt 2025-09-26 14:55:25 -04:00
Simon Cruanes
00078d8b43
update test 2025-09-26 14:55:25 -04:00
Simon Cruanes
e3be2aceaa
feat lwt: make sure we can setup/cleanup multiple times 2025-09-26 14:55:25 -04:00
Simon Cruanes
1eef212a3e
more sanity checks 2025-09-26 14:55:25 -04:00
Simon Cruanes
63559f0f3b
detail 2025-09-26 14:55:25 -04:00
Simon Cruanes
6c8c06b391
update lwt test 2025-09-26 14:55:25 -04:00
Simon Cruanes
122b3a6b06
feat lwt: make most functions work on any thread, not just the main 2025-09-26 14:55:25 -04:00
Simon Cruanes
786d75d680
comments/license for the Lwt hash server 2025-09-26 14:55:25 -04:00
Simon Cruanes
50b9dd9b62
fix CI for lwt tests 2025-09-26 14:55:25 -04:00
Simon Cruanes
da551edbd3
fix lwt tests 2025-09-26 14:55:25 -04:00
Simon Cruanes
6ae82f130a
feat lwt: proper wakeup; add lwt_main_runner 2025-09-26 14:55:25 -04:00
Simon Cruanes
0fecde07fc
test: update Lwt tests to use the new Moonpool_lwt 2025-09-26 14:55:25 -04:00
Simon Cruanes
a24bd7472d
feat worker_loop: always use reschedule in await
it's better than continuing right now because it will potentially
reschedule on the correct runner.
2025-09-26 14:55:25 -04:00
Simon Cruanes
796c4f6f31
feat lwt: improvements 2025-09-26 14:55:25 -04:00
Simon Cruanes
f53dbe4dda
cleanup worker loop 2025-09-26 14:55:25 -04:00
Simon Cruanes
e09c809a45
deprecate moonpool.sync 2025-09-26 14:55:25 -04:00
Simon Cruanes
f5993408c0
wip: debug echo server 2025-09-26 14:55:24 -04:00
Simon Cruanes
6c4fb69d23
wip: lwt 2025-09-26 14:55:24 -04:00
Simon Cruanes
72d8c09898
wip 2025-09-26 14:55:24 -04:00
Simon Cruanes
543135a0b0
wip: echo server using lwt 2025-09-26 14:55:24 -04:00
Simon Cruanes
295f22e770
wip: lwt 2025-09-26 14:55:24 -04:00
Simon Cruanes
bf90c32c86
wip lwt: event loop for moonpool directly inside lwt 2025-09-26 14:55:24 -04:00
Simon Cruanes
55e3e77a66
core: cleanup, and add a fine-grained API for the worker loop 2025-09-26 14:55:24 -04:00
Simon Cruanes
1a64e7345e
Revert "deprecate fibers"
This reverts commit 83acc18d3d.
2025-09-26 14:54:50 -04:00
Simon Cruanes
2c1def188a
breaking: require OCaml 5 2025-07-09 16:44:12 -04:00
Simon Cruanes
b9bbcf82f7
tests do not need the preprocessor anymore 2025-07-09 16:43:51 -04:00
Simon Cruanes
0ab99517d5
benchs: no preprocessor anymore 2025-07-09 16:41:05 -04:00
Simon Cruanes
41561c3bff
deprecate moonpool_io 2025-07-09 16:25:10 -04:00
Simon Cruanes
50a44a76e1
forkjoin no longer optional 2025-07-09 16:25:03 -04:00
Simon Cruanes
f6ad345f31
fib: remove preprocessor 2025-07-09 16:24:49 -04:00
Simon Cruanes
f8d5c564de
remove version-dependent preprocessor 2025-07-09 15:42:23 -04:00
Simon Cruanes
2dcc858384
remove Atomic stubs, we're already depending on >4.12 2025-07-09 15:39:26 -04:00
Simon Cruanes
83acc18d3d
deprecate fibers 2025-07-09 15:28:33 -04:00
Simon Cruanes
5ea9a3f587
remove bounded_queue 2025-07-09 15:28:25 -04:00
Simon Cruanes
867cbd2318
fix core: better repropagating of errors
2025-06-20 16:49:27 -04:00
Simon Cruanes
eba239487c
add Fut.{cancel,try_cancel}
2025-05-21 11:48:12 -04:00
Simon Cruanes
213d9bdd19
revert previous delayed await
2025-05-02 13:04:04 -04:00
Simon Cruanes
bb9418d86a
format with 0.27 2025-05-02 10:58:50 -04:00
Simon Cruanes
d50c227578
perf: await on immediately ready timer queues its task 2025-05-02 10:51:46 -04:00
Simon Cruanes
b46a048401
feat: add Moonpool.yield on ocaml 5
a mere alias to Picos.Fiber.yield
2025-05-02 10:33:30 -04:00
Simon Cruanes
ed0eda226c
prepare for 0.8 2025-04-17 16:35:19 -04:00
Simon Cruanes
2b00a0cea1
feat(exn_bt): in show/pp, do print the backtrace when present 2025-04-15 10:10:02 -04:00
Simon Cruanes
3a5eaaa44d
api(fut): public alias 'a Fut.t = 'a Picos.Computation.t 2025-03-19 17:40:17 -04:00
Simon Cruanes
f0ea8c294d
single system call for signal blocking 2025-03-13 15:42:04 -04:00
Simon Cruanes
dd88008a0a
fix: do not die if we fail to block a signal 2025-03-13 10:45:21 -04:00
Simon Cruanes
c51a0a6bd4
don't try to block sigstop 2025-03-13 10:45:01 -04:00
Simon Cruanes
deb96302e1
mli for worker loop 2025-03-13 10:07:39 -04:00
Simon Cruanes
a20208ec37
feat: block signals in workers if asked to 2025-03-13 10:07:20 -04:00
Simon Cruanes
389f237993
CI
2025-02-21 14:02:05 -05:00
Simon Cruanes
06f3bdadb9
CI 2024-12-04 11:09:26 -05:00
Simon Cruanes
e481c48fe5
relax bound on picos to 0.5-0.6 2024-12-04 11:04:44 -05:00
Simon Cruanes
6ab9a691bf
feat hmap FLS: do not fail if run outside of a fiber 2024-12-04 10:27:04 -05:00
Simon Cruanes
ea1af6ed22
fix task local storage: type was too specific 2024-12-04 08:45:26 -05:00
Simon Cruanes
fa40cf8825
doc 2024-10-18 12:56:43 -04:00
Simon Cruanes
9a598b1efc
feat: add Fut.make_promise, have 'a promise = private 'a fut 2024-10-18 12:52:21 -04:00
Simon Cruanes
a143cc8489
feat fib: spawn_ignore now has ?on optional param
close #33
2024-10-03 13:17:07 -04:00
Simon Cruanes
20245d11f3
cleanup 2024-10-03 13:17:05 -04:00
Simon Cruanes
9b6a1d3718
Merge pull request #34 from c-cube/simon/fix-cancellation-issue-2024-10-03
fix fiber: use a single fut/computation in fibers
2024-10-03 13:07:51 -04:00
Simon Cruanes
a85bc80573
add example program from discuss 2024-10-03 12:43:52 -04:00
Simon Cruanes
6a44598a31
fix fiber: use a single fut/computation in fibers
otherwise we check the wrong computation for cancellation,
see https://discuss.ocaml.org/t/confused-about-moonpool-cancellation/15381
2024-10-03 12:37:51 -04:00
Simon Cruanes
f128e6c63a
Merge pull request #32 from c-cube/simon/update-chan-2024-09-25
update channels to make them bounded and more efficient
2024-09-26 21:38:09 -04:00
Simon Cruanes
c7f517cc28
bench prime: cli arg to pick chan size 2024-09-25 22:54:21 -04:00
Simon Cruanes
d4be74c1b7
cleanup 2024-09-25 22:54:18 -04:00
Simon Cruanes
e7ee012108
chan: go back to the lock-full version 2024-09-25 22:19:34 -04:00
Simon Cruanes
94998ea407
lock free chan 2024-09-25 22:17:41 -04:00
Simon Cruanes
854c3b819b
update tests and benchs 2024-09-25 22:17:31 -04:00
Simon Cruanes
3b8b4d040a
test: move t_chan_train to effect-based tests 2024-09-25 21:32:59 -04:00
Simon Cruanes
e7b4223332
bench_primes script 2024-09-25 21:32:53 -04:00
Simon Cruanes
35a69924d3
wip: change Moonpool.Chan so it's bounded
also, push/pop now require effects; the OCaml 4 version only allows
try_push/try_pop.
2024-09-25 21:32:11 -04:00
Simon Cruanes
d8aa60558b
benchmark for a concurrent, channel-based Eratosthenes prime sieve 2024-09-25 21:31:28 -04:00
Simon Cruanes
0d8767f45f
prepare for 0.7 2024-09-10 08:54:07 -04:00
Simon Cruanes
d7c8df43d9
no need to configure for us 2024-09-05 13:52:55 -04:00
Simon Cruanes
784127316d
add Moonpool_fiber.spawn_top_ignore 2024-09-05 13:25:25 -04:00
Simon Cruanes
9623e2d4b6
add Moonpool_io.main; make configure idempotent 2024-09-05 13:24:17 -04:00
Simon Cruanes
65bc3c97ff
alias for get_current_runner 2024-09-05 09:55:10 -04:00
Simon Cruanes
9b3c75124e
simon/move to picos (#30)
* feat: depend on picos, use picos.exn_bt

* refactor: remove dla

* non optional dependency on thread-local-storage

it's a dep of picos anyway

* wip: use picos computations

* disable t_fib1 test, way too flaky

* feat `fut`: wrap picos computations

* detail in fut

* gitignore

* refactor core: use picos for schedulers; add Worker_loop_

we factor most of the thread workers' logic in `Worker_loop_`,
which is now shared between Ws_pool and Fifo_pool

* github actions

* feat fut: add `on_result_ignore`

* details

* wip: port to picos

* test: wip porting tests

* fix fut: trigger failing to attach doesn't signal it

* fix pool: only return No_more_tasks when local and global q empty

* format

* chore: fix CI by installing picos first

* more CI

* test: re-enable t_fib1 but with a single core fifo pool

it should be deterministic now!

* fixes after reviews

* bump minimal OCaml version to 4.13

* use `exn_bt`, not `picos.exn_bt`

* feat: optional dep on hmap, for inheritable FLS data

* format

* chore: depend on picos explicitly

* feat: move hmap-fls to Fiber.Fls

* change API for local FLS hmap

* refactor: move optional hmap FLS stuff into core/task_local_storage

* add Task_local_storage.remove_in_local_hmap

* chore: try to fix CI

* format

* chore: CI

* fix

* feat: add `Fls.with_in_local_hmap`

* chore: depend on hmap for tests

* fix test for FLS

use the inheritable keys

* chore: CI

* require OCaml 4.14 :/

* feat: add `moonpool.sync` with await-friendly abstractions

based on picos_sync

* fix: catch TLS.Not_set

* fix: `LS.get` shouldn't raise

* fix

* update to merged picos PR

* chore: CI

* fix dep

* feat: add `Event.of_fut`

* chore: CI

* remove dep on now defunct `exn_bt`

* feat: add moonpool-io

* chore: CI

* version constraint on moonpool-io

* add Event.Infix

* move to picos_io
2024-09-04 12:04:27 -04:00
Simon Cruanes
e3f11be0b3
fix related to TLS 2024-08-27 10:15:40 -04:00
Simon Cruanes
14fdee0593
compat 2024-08-26 11:23:54 -04:00
Simon Cruanes
444f8a3acc
fix with actual TLS release 2024-08-26 11:20:27 -04:00
Simon Cruanes
265d4f73dd
move to thread-local-storage 0.2 with get/set API 2024-08-16 10:07:51 -04:00
Simon Cruanes
3388098fcc
smol comment on OCaml 4 2024-06-26 11:30:29 -04:00
Simon Cruanes
a4db1e67be
chore: use 5.2 in CI (#28)
chore: use 5.2 in CI
2024-05-20 16:26:33 -04:00
Simon Cruanes
0750e6af41
prepare for 0.6 2024-04-02 12:51:57 -04:00
Simon Cruanes
a127a4131a
test 2024-04-02 12:51:57 -04:00
Simon Cruanes
83ada948aa
test: do not run lwt tests on mac OS for now 2024-04-02 12:51:57 -04:00
Simon Cruanes
a1814cadb4
lwt test 2024-04-02 12:51:57 -04:00
Simon Cruanes
cf6b20a979
update fiber tests to try to be more deterministic 2024-04-02 12:51:57 -04:00
Simon Cruanes
ef6811e062
do not run some tests on mac OS 2024-04-02 12:51:57 -04:00
Simon Cruanes
8e240357b5
update test outputs 2024-04-02 12:51:57 -04:00
Simon Cruanes
80e8f84703
test 2024-04-02 12:51:57 -04:00
Simon Cruanes
6981d37232
fix CI 2024-04-02 12:51:57 -04:00
Simon Cruanes
a99c0775e2
format dune files 2024-04-02 12:51:57 -04:00
Simon Cruanes
7a558bb5f3
chore: CI 2024-04-02 12:51:57 -04:00
Simon Cruanes
81b272e685
fix test 2024-04-02 12:51:57 -04:00
Simon Cruanes
ac851a6d81
fix: in Fut.await, upon failure, use resume
otherwise a fairly vicious bug happens: the await-er is resumed on the
current runner, not its native one, which can cause deadlocks as it
breaks the executors' dependency DAG. When using `resume` there is no
bug since `resume` is designed to always schedule on the correct runner.
2024-04-02 12:51:57 -04:00
Simon Cruanes
0d325741f4
add Exn_bt.pp 2024-04-02 12:51:57 -04:00
Simon Cruanes
ba1876f957
remove use of 5.1 only function 2024-03-21 17:02:03 -04:00
Simon Cruanes
8e6340846a
ensure that tests belong to the correct package 2024-03-21 10:32:41 -04:00
Simon Cruanes
54f6db0b42
test: add package 2024-03-15 00:02:42 -04:00
Simon Cruanes
53ed71db99
fix: make Fiber.check_if_cancelled fail if the fiber is not done 2024-03-14 23:47:26 -04:00
Simon Cruanes
f9aea68d61
add ?on param to Fiber.spawn 2024-03-05 09:47:31 -05:00
Simon Cruanes
f798420423
doc 2024-03-04 22:56:41 -05:00
Simon Cruanes
867444d975
perf: add Fut.raise_if_failed, use it in Fiber.check_if_cancelled 2024-03-04 22:05:47 -05:00
Simon Cruanes
86c6edffca
doc 2024-03-04 21:51:36 -05:00
Simon Cruanes
9cb10a79e6
renaming 2024-03-04 21:38:50 -05:00
Simon Cruanes
533b6e5ce2
docs 2024-03-04 21:22:41 -05:00
Simon Cruanes
3bdd269ca3
more docs, rename a function 2024-03-04 21:02:23 -05:00
Simon Cruanes
ad4ddc6816
refactor: split off domain pool to moonpool.dpool 2024-03-04 20:57:49 -05:00
Simon Cruanes
184690b21c readme 2024-03-04 20:49:34 -05:00
Simon Cruanes
c878b1a198 test for Moonpool_fib.main 2024-03-04 20:49:34 -05:00
Simon Cruanes
48fbf876dc feat: add Moonpool_fib.main
this is a convenient entrypoint for programs that use fibers. The main
thread can thus await fibers that run in the background.
2024-03-04 20:49:34 -05:00
Simon Cruanes
9df848cd17 breaking: remove Immediate_runner
it never really supported all that a runner should (effects, scheduling
other tasks, etc.)
2024-03-04 20:49:34 -05:00
Simon Cruanes
25104ce3b7 feat fifo: expose private interface for the main runner threads 2024-03-04 20:49:34 -05:00
Simon Cruanes
51459f9b0b feat: add Runner.dummy 2024-03-04 20:49:34 -05:00
Simon Cruanes
66f95df3b4 add Fiber.{return,fail} 2024-03-04 20:49:34 -05:00
Simon Cruanes
5817a8aee7 revert nurseries 2024-03-04 20:49:34 -05:00
Simon Cruanes
8c10c2b329 expose Task_local_storage.get_current 2024-03-04 20:49:34 -05:00
Simon Cruanes
45b8aa9999 cleanup the local storage APIs 2024-03-04 20:49:34 -05:00
Simon Cruanes
953947f694 feat local-storage: expose get/set with explicit storage 2024-03-04 20:49:34 -05:00
Simon Cruanes
4325fda345 update tests 2024-03-04 20:49:34 -05:00
Simon Cruanes
37751c79e4 introduce Fiber.Nursery.t; change local storage accordingly 2024-03-04 20:49:34 -05:00
Simon Cruanes
cec77d2ee9 doc 2024-03-04 20:49:34 -05:00
Simon Cruanes
9d392b1ba6
fix 2024-03-01 14:58:28 -05:00
Simon Cruanes
4195d4d61c
feat: add Task_local_storage.get_opt 2024-02-29 15:29:08 -05:00
Simon Cruanes
7b5ecffc8c
add Fiber.spawn_top_or_link 2024-02-29 15:08:06 -05:00
Simon Cruanes
39cdc37613
feat fiber: expose add_on_cancel/remove_on_cancel
also make it more robust by using a map
2024-02-28 22:50:19 -05:00
Simon Cruanes
62770a87b5
try to make test more deterministic 2024-02-28 00:18:27 -05:00
Simon Cruanes
a2ea24551b
compat 2024-02-27 22:48:23 -05:00
Simon Cruanes
42d16465c3
compat 2024-02-27 22:35:07 -05:00
Simon Cruanes
22f43670a7
update tests 2024-02-27 22:32:14 -05:00
Simon Cruanes
c39435d8eb
fix fifo_pool: resume can be called from another worker
we might schedule a task on worker 1, suspend it there, and then
resume it from worker 2.
2024-02-27 22:31:25 -05:00
Simon Cruanes
c975634837
test: update fiber tests 2024-02-27 22:13:52 -05:00
Simon Cruanes
bfd70dc5c2
fix fiber: spawn sub-fiber with a copy of current local storage 2024-02-27 22:11:50 -05:00
Simon Cruanes
bd7a48a4b4
make test slightly less heavy 2024-02-27 21:30:37 -05:00
Simon Cruanes
dac1450d54
update tests 2024-02-27 21:25:07 -05:00
Simon Cruanes
856dc85d41
refactor ws_pool: do not nest effect handlers; fixes
- we differentiate between starting a task and resuming a task
- we dynamically find if we're on one of the pool's runner
  in `resume`/`run_another_task` in the main suspend handler
  (this way we can use the local work stealing queue
  if we're in the same pool, even if we're not on the
  worker that ran the "suspend" call itself)
2024-02-27 21:23:37 -05:00
Simon Cruanes
b9cf0616b8
fifo pool: format, use with_suspend unconditionally 2024-02-27 21:23:02 -05:00
Simon Cruanes
e94c7999de
suspend: provide dummy with_suspend_handler on 4.xx 2024-02-27 21:22:48 -05:00
Simon Cruanes
55b975017f
hash server test: use artificial, pre-generated data
this way the test won't be broken every time we change the source
code(!)
2024-02-27 21:20:19 -05:00
Simon Cruanes
ed171c1171
fix ws_pool: make sure we capture the current worker before suspend 2024-02-23 20:54:51 -05:00
Simon Cruanes
4cdec87aea
add some tests for fiber 2024-02-21 00:53:24 -05:00
Simon Cruanes
8a7cfb6fb0
feat fiber: add helpers, expose any 2024-02-21 00:53:07 -05:00
Simon Cruanes
cb8668f3dc
simplify test 2024-02-20 22:21:11 -05:00
Simon Cruanes
38df050a13
gh pages 2024-02-20 21:57:15 -05:00
Simon Cruanes
2faf78564d
CI 2024-02-20 21:00:49 -05:00
Simon Cruanes
a6d6eec6c7
CI: make test more deterministic, run tests with all depopts 2024-02-20 20:35:54 -05:00
Simon Cruanes
a8f874e4ab
test 2024-02-20 20:26:37 -05:00
Simon Cruanes
101d15f874
fix race conditions in tests 2024-02-20 19:57:58 -05:00
Simon Cruanes
d94a197381
try to fix tests 2024-02-20 18:48:10 -05:00
Simon Cruanes
b58a55536c
do not require libev 2024-02-20 18:43:53 -05:00
Simon Cruanes
fa5ae97b6d
restrict moonpool-lwt tests to 5.0 2024-02-20 18:39:27 -05:00
Simon Cruanes
f68f27a4a4
echo server tests for moonpool-lwt 2024-02-20 18:34:35 -05:00
Simon Cruanes
004f5fc82b
test: add a test for the hash server+client 2024-02-20 18:25:18 -05:00
Simon Cruanes
83ae0e7a4e
do not expose ?ls for Fut.spawn 2024-02-20 13:51:41 -05:00
Simon Cruanes
8614d4be40
simplify implem for background_thread for now 2024-02-20 13:45:05 -05:00
Simon Cruanes
4bf456e1ea
feat: add Background_thread
a version of `Fifo_pool` optimized for a single thread.
2024-02-20 13:43:06 -05:00
Simon Cruanes
df8b284a0d
Merge pull request #22 from c-cube/wip-pool-lwt
moonpool-lwt, for interop with lwt
2024-02-20 12:00:49 -05:00
Simon Cruanes
ed6db54b1a
update tests 2024-02-17 12:40:37 -05:00
Simon Cruanes
8bfe76b3e0
improve docs for moonpool_lwt; fix race condition 2024-02-17 12:40:36 -05:00
Simon Cruanes
283a1cb118
test: add hash server/client
the protocol: send a filename per line over TCP, and the server
replies with the filename and its content's hash
2024-02-17 12:40:36 -05:00
Simon Cruanes
0df0642dd1
lwt: add TCP client/server functions that use lwt channels 2024-02-17 12:40:36 -05:00
Simon Cruanes
e789cbe4f7
fix echo_client to actually do its work 2024-02-17 12:40:36 -05:00
Simon Cruanes
b991a78f3e
lwt: trace the function that performs actions in batch 2024-02-17 12:40:36 -05:00
Simon Cruanes
36c6e8e900
fix build, remove some debug 2024-02-17 12:40:36 -05:00
Simon Cruanes
bd00e0838a
test: use async spans for lwt echo server 2024-02-17 12:40:36 -05:00
Simon Cruanes
6bf58e3e62
perf: quicker check for [] 2024-02-17 12:40:36 -05:00
Simon Cruanes
e0f5b5bbcb
fix build 2024-02-17 12:40:36 -05:00
Simon Cruanes
0605ef4a1e
CI 2024-02-17 12:40:36 -05:00
Simon Cruanes
6f707c869c
chore 2024-02-17 12:40:36 -05:00
Simon Cruanes
4ff45df7e7
basic echo client for moonpool-lwt 2024-02-17 12:40:35 -05:00
Simon Cruanes
f6d67028cf
feat lwt: non-blocking TCP_client.with_connect 2024-02-17 12:40:35 -05:00
Simon Cruanes
38b84e0c03
test: use libev to make sure we handle more than 1024 simultaneous
connections in the echo server
2024-02-17 12:40:35 -05:00
Simon Cruanes
0e9d358cb5
basic echo server for moonpool-lwt 2024-02-17 12:40:35 -05:00
Simon Cruanes
930e09e5b3
lwt: basic IO wrappers, simple TCP server wrapper 2024-02-17 12:40:35 -05:00
Simon Cruanes
d248a569f6
feat: progress on moonpool-lwt 2024-02-17 12:40:35 -05:00
Simon Cruanes
90850ae38c
wip: moonpool lwt 2024-02-17 12:40:35 -05:00
Simon Cruanes
fbc7679d05
wip: add moonpool-lwt 2024-02-17 12:40:35 -05:00
Simon Cruanes
a5eef687c8
expose Tracing 2024-02-17 12:40:35 -05:00
Simon Cruanes
cf8555bcec
revert: remove name on futures and tasks
async tracing will be more robust, and is enabled by
task local storage
2024-02-17 12:40:02 -05:00
Simon Cruanes
b0d2716eff
Merge pull request #20 from c-cube/wip-refactor-structure
refactor structure of project, add structured concurrency
2024-02-15 21:12:02 -05:00
Simon Cruanes
4ab76d5084
details 2024-02-14 14:57:22 -05:00
Simon Cruanes
2a42f15e37
feat: pass task local storage in run_async
the idea is that we could use this to pass storage
around in `Fut` combinators, but I'm not sure that's actually
a good idea.
2024-02-14 14:54:18 -05:00
Simon Cruanes
e8e61f6b30
feat: improve task local storage 2024-02-12 12:04:36 -05:00
Simon Cruanes
41b73462dd
fix in task_local_storage 2024-02-12 11:33:09 -05:00
Simon Cruanes
b32bf3ea3c
fix in task local storage 2024-02-12 11:30:29 -05:00
Simon Cruanes
3c8bb7d5e8
cleanup 2024-02-11 16:40:15 -05:00
Simon Cruanes
712a030206
refactor: streamline suspend, make most of it 5.0-dependent 2024-02-11 16:40:15 -05:00
Simon Cruanes
f7449416e4
restore DLA 2024-02-11 16:40:15 -05:00
Simon Cruanes
c8e99fd7ee
cleanup 2024-02-11 16:40:15 -05:00
Simon Cruanes
e9c09406ba
feat: add structured concurrency moonpool.fib 2024-02-11 16:40:14 -05:00
Simon Cruanes
13d26783c7
fix 2024-02-11 16:40:14 -05:00
Simon Cruanes
44c155751c
fix 2024-02-11 16:40:14 -05:00
Simon Cruanes
e1219ade54
details in test 2024-02-11 16:40:14 -05:00
Simon Cruanes
e8cc87f1f5
fixes 2024-02-11 16:40:14 -05:00
Simon Cruanes
65fd89708e
add task_local_storage to core, modify how suspend works 2024-02-11 16:40:14 -05:00
Simon Cruanes
2f11fd75df
feat: add Exn_bt to core 2024-02-11 16:40:14 -05:00
Simon Cruanes
b8ce0c9fe3
make moonpool.forkjoin optional (only on OCaml >= 5.0) 2024-02-11 16:40:14 -05:00
Simon Cruanes
49c6cd3f53
doc 2024-02-11 16:40:14 -05:00
Simon Cruanes
6d6acba541
fix readme 2024-02-11 16:40:13 -05:00
Simon Cruanes
e14fef2834
remove deprecated Moonpool.Pool 2024-02-11 16:39:44 -05:00
Simon Cruanes
223f22a0d9
breaking: move fork-join into sub-library moonpool.forkjoin 2024-02-11 16:39:44 -05:00
Simon Cruanes
0f1f39380f
CI 2024-02-11 16:39:44 -05:00
Simon Cruanes
37c42b68bc
fix 2024-02-11 16:39:44 -05:00
Simon Cruanes
b0fe279f42
more internal refactor 2024-02-11 16:39:44 -05:00
Simon Cruanes
cc8860c6e3
split into moonpool.private and moonpool; format; remove DLA 2024-02-11 16:39:43 -05:00
Simon Cruanes
ec28758fdc
doc 2024-02-11 16:37:32 -05:00
Simon Cruanes
fdd2df0572
readme 2024-02-09 10:11:20 -05:00
Simon Cruanes
9876951748
readme 2024-02-09 10:08:03 -05:00
Simon Cruanes
381a775d28
fix some @since tags 2024-02-09 10:07:09 -05:00
Simon Cruanes
e56dbc6a09
fix build 2024-02-07 15:49:01 -05:00
Simon Cruanes
6d92d14fcb
CI 2024-02-07 15:32:44 -05:00
Simon Cruanes
27b213e30f
tracing: add optional name for pools 2024-02-07 13:31:48 -05:00
Simon Cruanes
6ed870aa9c
Merge pull request #19 from c-cube/wip-tracing-support
wip: tracing support
2024-02-06 22:24:10 -05:00
Simon Cruanes
8d83d5b691
perf: reduce size of Fut again 2024-01-30 17:06:20 -05:00
Simon Cruanes
4abc334ab3
fix 2024-01-30 16:38:31 -05:00
Simon Cruanes
8e9564a6f7
compat 2024-01-30 16:26:29 -05:00
Simon Cruanes
ef7d370060
more tracing for Fut 2024-01-30 16:25:31 -05:00
Simon Cruanes
192f866ea1
chore: install depopts in CI 2024-01-30 16:12:14 -05:00
Simon Cruanes
dd9206b5b8
use new tracing support in tests and benchs 2024-01-30 16:12:03 -05:00
Simon Cruanes
092ad5f2ce
feat: support for trace as a depopt
one can now give tasks an optional string "name". If `Trace`
is present (installed) and enabled, this results in
a span around the task's execution. This also plays well
with `await` and other effect-based primitives.
2024-01-30 16:10:11 -05:00
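A sketch of how such a depopt can work, assuming the `trace` library's `Trace.enabled` and `Trace.with_span` (simplified, not Moonpool's exact code): if tracing is available and enabled, the task's execution is wrapped in a named span; otherwise the task runs untouched.

```ocaml
(* hypothetical helper: wrap a named task in a Trace span when enabled *)
let run_task ?name (f : unit -> unit) : unit =
  match name with
  | Some n when Trace.enabled () ->
    Trace.with_span ~__FILE__ ~__LINE__ n (fun _span -> f ())
  | _ -> f ()
```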
Simon Cruanes
469cb89ecd
feat: add Fut.Advanced.barrier_on_abstract_container_of_futures
this is a good building block for waiting on multiple futures.
2023-12-22 13:14:05 -05:00
Simon Cruanes
6aa8a2e7d2
try to speed up CI 2023-12-06 21:44:42 -05:00
Simon Cruanes
0e6e581f63
update opam file 2023-12-04 00:20:27 -05:00
Simon Cruanes
15e314129f
Merge branch 'br-0.5.1' 2023-11-27 22:13:34 -05:00
Simon Cruanes
019cea2d5c
prepare for 0.5.1 2023-11-24 23:59:10 -05:00
Simon Cruanes
a540c091e6
fix too early exit in Ws_pool
when shutting down the pool, workers should check `self.active`
only when they have no local task, failed to steal tasks,
and found the main queue to be empty. Basically we check `self.active`
only just before we wait on the condition.
2023-11-24 23:58:08 -05:00
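An illustrative worker loop for the shutdown ordering described above, with stand-in types and helpers (not the actual Ws_pool code): `active` is consulted only once the local deque, stealing, and the main queue have all come up empty, right before blocking on the condition variable.

```ocaml
type pool = {
  mutex : Mutex.t;
  cond : Condition.t;
  main_q : (unit -> unit) Queue.t;
  mutable active : bool;
}

(* stand-ins for the per-worker deque and for work stealing *)
let pop_local (_ : pool) : (unit -> unit) option = None
let try_steal (_ : pool) : (unit -> unit) option = None

let rec worker_loop (self : pool) : unit =
  match pop_local self with
  | Some task -> task (); worker_loop self
  | None ->
    (match try_steal self with
    | Some task -> task (); worker_loop self
    | None ->
      Mutex.lock self.mutex;
      (match Queue.take_opt self.main_q with
      | Some task ->
        Mutex.unlock self.mutex;
        task ();
        worker_loop self
      | None when not self.active ->
        (* every queue is drained *and* the pool is shut down: exit now *)
        Mutex.unlock self.mutex
      | None ->
        (* pool still active: block until woken up, then try again *)
        Condition.wait self.cond self.mutex;
        Mutex.unlock self.mutex;
        worker_loop self))
```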
Simon Cruanes
9513b82bd0
update test 2023-11-24 23:58:03 -05:00
Simon Cruanes
dd81def70a
add another test 2023-11-24 23:57:59 -05:00
Simon Cruanes
2d306c91b2
fix too early exit in Ws_pool
when shutting down the pool, workers should check `self.active`
only when they have no local task, failed to steal tasks,
and found the main queue to be empty. Basically we check `self.active`
only just before we wait on the condition.
2023-11-24 23:12:22 -05:00
Simon Cruanes
16663651d6
update test 2023-11-24 23:12:20 -05:00
Simon Cruanes
b58041153a
add another test 2023-11-24 22:56:33 -05:00
Simon Cruanes
1c94c59d88
feat: add Fut.map_list 2023-11-17 12:41:29 -05:00
Simon Cruanes
9de83bde6a
changelog 2023-11-08 15:02:32 -05:00
Simon Cruanes
3f9600ea4d
CI: more lenient timeout 2023-11-08 12:41:51 -05:00
Simon Cruanes
fd2102c7fe
prepare for 0.5 2023-11-08 12:38:40 -05:00
Simon Cruanes
f50ffe9891
Merge pull request #12 from c-cube/wip-ws-2023-10-25
implement proper work-stealing pool
2023-11-08 12:11:46 -05:00
Simon Cruanes
3f7ed7b6b8 moonpool: expose Fut.{spawn_on_current_runner,await} 2023-11-08 12:02:13 -05:00
Simon Cruanes
989c012f77 fix warnings 2023-11-08 12:02:13 -05:00
Simon Cruanes
62e8336d84 update tests and benchs for new Fut API 2023-11-08 12:02:13 -05:00
Simon Cruanes
59ae1068fd breaking: fut: only have module Infix
we keep `Infix_local` as a deprecated alias to it for now
2023-11-08 12:02:13 -05:00
Simon Cruanes
d4e5e811bb breaking: fut: change behavior of ?on
combinators that take `?on` will now check if they can use the current
runner if `?on:None` is passed. If no runner is passed and they're run
from outside a runner, they will just run in the relevant callback or on
the current thread.
2023-11-08 12:02:13 -05:00
Simon Cruanes
9709f88d5f breaking: fut: join does not take ?on anymore 2023-11-08 12:02:13 -05:00
Simon Cruanes
9cb7781a2e feat: fut: add spawn_on_current_runner 2023-11-08 12:02:13 -05:00
Simon Cruanes
0a432585c6 fix: in WS_pool, only schedule in worker queues in some cases
only do it if we actually want to spawn the task on pool A while running
on a worker of A (not a worker on B).
2023-11-08 12:02:13 -05:00
Simon Cruanes
245bfd9b7b test: regression test for bug 2023-11-08 12:02:13 -05:00
Simon Cruanes
d2be2db0ef perf fork-join: in both f g only run f in the background
`g` can run immediately on the same thread; otherwise we would just
suspend the computation and start a new task for nothing.
2023-11-08 11:38:15 -05:00
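A simplified sketch of the scheduling shape this commit describes, using Moonpool's public `Fut` API rather than the effect-based internals: only `f` is pushed to the pool, while `g` runs inline on the current thread, so one task spawn is saved per call.

```ocaml
(* sketch: must itself run inside a runner, since [Fut.await] uses effects *)
let both ~on (f : unit -> 'a) (g : unit -> 'b) : 'a * 'b =
  let fut_f = Moonpool.Fut.spawn ~on f in (* [f] goes to the pool *)
  let res_g = g () in                     (* [g] runs right here *)
  (Moonpool.Fut.await fut_f, res_g)
```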
Simon Cruanes
6e6a2a1faa feat runner: add get_current_runner
this relies on TLS to access the current pool really quickly.
2023-11-08 11:38:15 -05:00
Simon Cruanes
6fe7076099 update doc and readme 2023-11-08 11:36:10 -05:00
Simon Cruanes
2073c600c4 add Moonpool.run_async 2023-11-08 11:36:10 -05:00
Simon Cruanes
72f289af84 refactor: use a fixed size work-stealing deque
if it's full, tasks overflow into the main queue.
2023-11-08 11:36:10 -05:00
Simon Cruanes
80031c0a54 fix compilation error 2023-11-08 09:41:57 -05:00
Simon Cruanes
00a5cfc8ba fix: rename Thread_local_storage_ to not collide with the library 2023-11-08 09:41:57 -05:00
Simon Cruanes
928345437a fix tests to use new API 2023-11-08 09:41:57 -05:00
Simon Cruanes
133a0d6128 breaking: change interface for number of threads
now the user can specify `num_threads`; if not provided a sensible
default is picked.
2023-11-08 09:41:57 -05:00
Simon Cruanes
21ac980fb2 rename no_runner to immediate_runner 2023-11-08 09:41:57 -05:00
Simon Cruanes
056f80b318 add No_runner: a runner that doesn't do anything in the background
The idea is that you might have APIs that want a runner, but the work is
too trivial to require a full actual thread pool. In this case use
`No_runner.runner` and calls to `run_async runner f` will turn into `f()`.
2023-11-08 09:41:57 -05:00
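A minimal sketch of the idea, with a simplified record type rather than Moonpool's abstract `Runner.t`: a "runner" whose `run_async` just calls the task immediately on the caller's thread.

```ocaml
type runner = { run_async : (unit -> unit) -> unit }

(* no background threads: the task runs synchronously in the caller *)
let no_runner : runner = { run_async = (fun f -> f ()) }

let () = no_runner.run_async (fun () -> print_endline "runs synchronously")
```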
Simon Cruanes
a3d3468b5e feat: add Moonpool.recommended_thread_count 2023-11-08 09:41:57 -05:00
Simon Cruanes
7a36783e8b perf: also use the main domain, along with n-1 other ones
we always keep a thread alive on the main domain as a worker for new
tasks, but other domains can still come and go to manage resources
properly in case a pool is started and used only for a short while.
2023-11-08 09:41:57 -05:00
Simon Cruanes
def384b4f8
fix warning 2023-10-27 16:18:24 -04:00
Simon Cruanes
052b70027a
compat 4.08 2023-10-27 16:17:56 -04:00
Simon Cruanes
bfcf7f774e
chore: handpick combinations for bench-pi 2023-10-27 16:16:49 -04:00
Simon Cruanes
ddf394be90
chore: handpick set of benchmarks 2023-10-27 16:06:55 -04:00
Simon Cruanes
aba0d84ecf
ws pool: random stealing; rework main state machine
in the state machine, after waiting we check the main queue;
otherwise we go directly to stealing.
2023-10-27 16:05:52 -04:00
Simon Cruanes
aa7906eb2c
perf TLS: inline 2023-10-27 16:05:43 -04:00
Simon Cruanes
e67bffeca5
ws_pool: use TLS for quick worker storage access; reduce contention 2023-10-27 15:18:50 -04:00
Simon Cruanes
b4ddd82ee8
ws pool: use non atomic boolean to reduce number of wakeups; refactor 2023-10-27 14:48:13 -04:00
Simon Cruanes
359ec0352b
small change to test 2023-10-27 14:47:59 -04:00
Simon Cruanes
68fe7221b8
suspend: remove additional parameter, always run tasks in handler 2023-10-27 14:47:54 -04:00
Simon Cruanes
9e0a583a94
chore: disable -dl/-seq benchs for now, too verbose 2023-10-27 14:46:59 -04:00
Simon Cruanes
c044fb8fc9
tweak to WS pool 2023-10-27 12:26:50 -04:00
Simon Cruanes
08722691e8
ws deque: try to reduce false sharing 2023-10-27 12:26:43 -04:00
Simon Cruanes
9e93ebd3bb
update benchmark fib-rec with more implementations 2023-10-27 12:26:03 -04:00
Simon Cruanes
5409cf8e1b
compat 4.08 2023-10-25 23:50:34 -04:00
Simon Cruanes
3e614ec992
rename Pool to Ws_pool; deprecated Moonpool.Pool 2023-10-25 23:43:18 -04:00
Simon Cruanes
30035fa67d
fix pool: suspension handler might run from a different thread!
this means we can't reuse the same worker state: it's neither thread-safe
nor deadlock-safe (the worker whose state it is might be waiting on
the main queue)
2023-10-25 23:30:04 -04:00
Simon Cruanes
1e3629bc67
fix ws_deque: strict bound for shrinking 2023-10-25 23:29:47 -04:00
Simon Cruanes
629b66662f
fix ws_deque: circular array is also in an atomic 2023-10-25 23:18:47 -04:00
Simon Cruanes
1ed25e5aca
test: make t_ws_deque consume less ram 2023-10-25 22:41:26 -04:00
Simon Cruanes
dfb588cdc5
test: update readme 2023-10-25 22:41:18 -04:00
Simon Cruanes
056986c84f
perf pool: no retries for self-queue; fewer retries for stealing 2023-10-25 22:38:43 -04:00
Simon Cruanes
e937bf0e9d
perf pool: restore non-hashtable lookup for self-queuing 2023-10-25 22:38:35 -04:00
Simon Cruanes
3956fb6566
fix ws_pool: no work stealing for pools of 1 worker
it would try to find the index of another worker to steal from,
and loop forever because there is no other worker.
2023-10-25 22:33:08 -04:00
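A sketch of the fix with a hypothetical victim-picking helper (not the actual Moonpool code): a pool with a single worker has nobody else to steal from, so the helper must bail out instead of looping.

```ocaml
let pick_victim ~num_workers ~self_idx : int option =
  if num_workers <= 1 then
    None (* no other worker: the old code would loop here forever *)
  else (
    let i = ref (Random.int num_workers) in
    while !i = self_idx do
      i := Random.int num_workers
    done;
    Some !i
  )
```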
Simon Cruanes
d9da7844e2
test: run more diverse pools in t_fib_rec 2023-10-25 22:32:57 -04:00
Simon Cruanes
894851f6e8
comments in ws_deque 2023-10-25 22:32:42 -04:00
Simon Cruanes
530507d84e
fix fifo_pool: raise Shutdown, not Closed, in run 2023-10-25 22:11:08 -04:00
Simon Cruanes
3f720241b2
feat pool: use Shutdown when running a task after it's closed 2023-10-25 22:10:10 -04:00
Simon Cruanes
9a1309c44f
chore: add keywords in opam 2023-10-25 22:03:35 -04:00
Simon Cruanes
a89c0ce4f2
breaking: make Runner.t abstract 2023-10-25 21:59:22 -04:00
Simon Cruanes
6452ca89d1
rename Simple_pool into Fifo_pool, update doc 2023-10-25 21:55:29 -04:00
Simon Cruanes
db33bec13f
wip: better work stealing pool 2023-10-25 12:11:41 -04:00
Simon Cruanes
e0d3a18562
improve test a bit 2023-10-25 09:57:25 -04:00
Simon Cruanes
fdc188c291
wip: debug pool 2023-10-25 09:57:19 -04:00
Simon Cruanes
3d7e272d01
perf ws_deque: use bitmasks instead of modulo op 2023-10-25 09:57:05 -04:00
Simon Cruanes
78407c495d
more tests for WS_deque 2023-10-25 09:56:56 -04:00
Simon Cruanes
078adae786
limit CI time 2023-10-25 09:28:18 -04:00
Simon Cruanes
91c0c3f6c1
fix props 2023-10-25 01:01:51 -04:00
Simon Cruanes
ef05146e03
tweak retry thresholds in pool 2023-10-25 00:28:19 -04:00
Simon Cruanes
3bfc4cdcc7
more test 2023-10-25 00:28:16 -04:00
Simon Cruanes
73c2f9768c
benchs: run with both pool and simple_pool 2023-10-25 00:21:29 -04:00
Simon Cruanes
c03e342178
tests: run some tests on both Pool and Simple_pool 2023-10-25 00:21:07 -04:00
Simon Cruanes
e67ab53f9f
feat pool: rewrite main pool to use work stealing
there's a single blocking queue, and one WS_queue per worker. Scheduling
into the pool from a worker (e.g. via fork_join or explicitly) will push
into this WS queue; otherwise it goes into the main blocking queue.

Workers will always try to empty their local queue first, then try to
work steal, then block on the main queue.
2023-10-25 00:19:34 -04:00
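A sketch of the scheduling policy just described, with hypothetical stand-in types (a plain `Queue.t` for the work-stealing deque, a `ref` for thread-local storage): a task submitted from inside a worker goes to that worker's own queue, while a task submitted from outside goes to the shared blocking queue.

```ocaml
type task = unit -> unit
type worker = { deque : task Queue.t } (* stand-in for a WS deque *)
type pool = { main_q : task Queue.t; mutex : Mutex.t; cond : Condition.t }

(* stand-in for TLS: [Some w] iff the current thread is a pool worker *)
let current_worker : worker option ref = ref None

let schedule (p : pool) (t : task) : unit =
  match !current_worker with
  | Some w -> Queue.push t w.deque (* fast path: local deque *)
  | None ->
    Mutex.lock p.mutex;
    Queue.push t p.main_q; (* slow path: shared blocking queue *)
    Condition.signal p.cond;
    Mutex.unlock p.mutex
```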
Simon Cruanes
f2e9f99b36
perf ws_deque: implement shrinking and a push optim 2023-10-25 00:17:50 -04:00
Simon Cruanes
ae5f3a7e97
tests for ws_deque 2023-10-25 00:09:38 -04:00
Simon Cruanes
95b27b3a70
feat: add Ws_deque_ 2023-10-25 00:09:28 -04:00
Simon Cruanes
0cec78eb30
refactor: move some common code to Suspend_ 2023-10-25 00:09:09 -04:00
Simon Cruanes
4c4b720306
feat: add Simple_pool, with the naive single-queue implementation 2023-10-25 00:08:48 -04:00
Simon Cruanes
43eca1d4e2
restore test 2023-10-24 19:56:18 -04:00
Simon Cruanes
fb7cc5d69f
add heavier test for a particular hangup in fork join 2023-10-24 16:56:52 -04:00
Simon Cruanes
9ab9df78c9
update tests a bit 2023-10-24 13:48:23 -04:00
Simon Cruanes
d15bfb07f2
fix pool: rework scheduler to use one condition 2023-10-24 13:47:53 -04:00
Simon Cruanes
69faea0bcb
wip: have only one condition in pool 2023-10-24 12:53:19 -04:00
Simon Cruanes
60255c0e95
test: add dep on trace-tef; add new test for scheduling issues
trying to expose that sometimes, some workers might be asleep while
others do several tasks, because they're sleeping on the "wrong" queue
2023-10-24 10:03:46 -04:00
Simon Cruanes
faeb95b49d
fix pool: on shutdown, finish reading from all queues 2023-10-24 09:53:18 -04:00
Simon Cruanes
b8a31b088f
fix @since tags 2023-10-19 11:08:28 -04:00
Simon Cruanes
d37733b5bd add tests for Bounded_queue 2023-10-12 22:24:28 -04:00
Simon Cruanes
d60e830e90 fixes for blocking queue 2023-10-12 22:24:28 -04:00
Simon Cruanes
3e8e734cf7 doc 2023-10-12 22:24:28 -04:00
Simon Cruanes
39d4e61331 some doc 2023-10-12 22:24:28 -04:00
Simon Cruanes
3834bf0796 add Bounded_queue 2023-10-12 22:24:28 -04:00
Simon Cruanes
4059903e09
moonpool: update interface of Blocking_queue 2023-09-14 22:32:58 -04:00
Simon Cruanes
ace43c0852
add Bb_queue.transfer 2023-09-14 22:30:44 -04:00
Simon Cruanes
f90773a99a
add Bb_queue.to_{iter,gen,seq} 2023-09-12 23:07:04 -04:00
Simon Cruanes
2800a3e0a6
CI: use 5.0 to generate online docs 2023-08-29 14:34:11 -04:00
Simon Cruanes
43ca60ff15
prepare for 0.4 2023-08-29 14:26:55 -04:00
Simon Cruanes
2068088255
some comments 2023-08-29 14:26:55 -04:00
Simon Cruanes
0345b65fb4
fix build on < 5.0 2023-08-29 14:22:14 -04:00
Simon Cruanes
c4bbec092a
chore: add dune-workspace file 2023-08-29 14:22:14 -04:00
Simon Cruanes
c1b6312cad
Update src/d_pool_.ml 2023-08-29 14:22:13 -04:00
David Allsopp
c0db72b40c
Quickly hacked version joining a previous domain
Ensure domains have been cleaned up before re-using a slot.
2023-08-29 14:22:13 -04:00
Simon Cruanes
25d42d5b8c
doc 2023-08-29 14:22:13 -04:00
Simon Cruanes
d381b1dd12
CI for docs 2023-08-29 14:22:13 -04:00
Simon Cruanes
0f670c47d3
perf: in Bb_queue, only signal condition on push if queue was empty 2023-08-23 21:31:42 -04:00
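A hedged sketch of this optimization over a hypothetical blocking queue: only a push onto an empty queue can unblock a waiting consumer, so the condition variable needs signaling only in that case.

```ocaml
type 'a bb_queue = { mutex : Mutex.t; cond : Condition.t; q : 'a Queue.t }

let push (self : 'a bb_queue) (x : 'a) : unit =
  Mutex.lock self.mutex;
  let was_empty = Queue.is_empty self.q in
  Queue.push x self.q;
  (* consumers only wait while the queue is empty, so a push onto a
     non-empty queue cannot be what they are waiting for *)
  if was_empty then Condition.broadcast self.cond;
  Mutex.unlock self.mutex
```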
Simon Cruanes
18d5bad2a9
domain pool: let domains live a bit longer
this is an optimization to ensure we don't stop/start domains too often,
which harms performance really badly.
2023-08-13 22:48:04 -04:00
Simon Cruanes
cfbcc72648
improve a bit the resource test 2023-08-13 22:47:54 -04:00
Simon Cruanes
ed531e68e1
fix: race condition in shutdown, we need to wait for domain to quit
the risk is a tight loop of `Pool.with_`: by not waiting for the pool
to shut down entirely (potentially including the domains), we risk
running out of domains in subsequent iterations.
2023-08-13 22:24:18 -04:00
Simon Cruanes
6c4d2cbc79
test for proper resource handling in Pool.shutdown 2023-08-13 22:24:14 -04:00
Simon Cruanes
7b0e7de94d
Merge pull request #8 from c-cube/wip-7
full lifecycle for worker domains
2023-08-13 17:47:58 -04:00
Simon Cruanes
e38ee31b93
bugfix: forgot to dispose of the worker state on winding down 2023-08-12 22:07:08 -04:00
Simon Cruanes
9db0a9fe28
full lifecycle for worker domains
domains can now stop when all worker threads running on them are done
2023-08-12 14:09:35 -04:00
Simon Cruanes
5680938a6c
fix: generalize type of create_arg 2023-08-01 12:34:13 -04:00
Simon Cruanes
aa0dea3e34
add Fut.{reify_error,bind_reify_error} 2023-07-26 16:22:42 -04:00
Simon Cruanes
d18e88a772
details in test 2023-07-17 10:54:47 -04:00
Simon Cruanes
8e9628ac81
prepare for 0.3 2023-07-16 23:36:34 -04:00
Simon Cruanes
366d26527c
another test 2023-07-16 23:21:45 -04:00
Simon Cruanes
f7d9e6c0c1
add prop test for Fork_join.map_list 2023-07-16 23:17:16 -04:00
Simon Cruanes
6c0063f9ac
more tests 2023-07-13 12:23:52 -04:00
Simon Cruanes
20240a6190
fix: Fork_join.for_ 0 must not hang 2023-07-13 12:23:25 -04:00
Simon Cruanes
e6a2afb2ba
fix readme 2023-07-10 23:31:27 -04:00
Simon Cruanes
548212ab43
add prop test for fork_join 2023-07-10 23:20:25 -04:00
Simon Cruanes
d995992917
remove Fork_join.map_reduce_commutative
A bit too specialized, and not hard to do with `for_`. Let's see if it turns
out to be really useful.
2023-07-10 23:20:24 -04:00
Simon Cruanes
9ce94fd242
feat fork_join: add map_array and map_list 2023-07-10 23:18:26 -04:00
Simon Cruanes
1cb5342092
expose Runner 2023-07-10 22:48:04 -04:00
Simon Cruanes
6c73afbe5b
perf: optimize Fut.for_ a bit
remove intermediate array.
2023-07-10 11:00:34 -04:00
Simon Cruanes
0f4e115bc0
doc 2023-07-10 01:31:12 -04:00
Simon Cruanes
f081c0f8ad
tweak: guess of chunk_size in Fork_join 2023-07-10 01:23:30 -04:00
Simon Cruanes
2326ae0078
modify Fork_join.for_ to avoid the allocator.
The allocator means calling a closure at each step, which means a local
reference has to be heap-allocated (and worse, that floats will be
boxed). Instead we give the function a pair of low/high bounds for a
local for-loop.
2023-07-10 01:19:16 -04:00
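A usage sketch, assuming the commit-era paths `Moonpool.Pool.run_wait_block` and `Moonpool.Fork_join.for_` with signature `?chunk_size:int -> int -> (int -> int -> unit) -> unit`: the body receives inclusive `low`/`high` bounds and runs a plain local loop over its chunk, so no per-index closure (and no boxing of loop-local state) is needed.

```ocaml
(* sum of squares 0..n-1, parallelized by chunks; fork-join relies on
   effects, so the body runs inside the pool *)
let sum_squares ~pool (n : int) : int =
  let total = Atomic.make 0 in
  Moonpool.Pool.run_wait_block pool (fun () ->
      Moonpool.Fork_join.for_ ~chunk_size:1024 n (fun low high ->
          let acc = ref 0 in
          for i = low to high do
            acc := !acc + (i * i)
          done;
          ignore (Atomic.fetch_and_add total !acc)));
  Atomic.get total
```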
Simon Cruanes
55f831bc8b
add Fork_join.{for_,map_reduce_commutative} 2023-07-10 01:14:16 -04:00
Simon Cruanes
858755e812
feat: add Pool.with_ 2023-07-10 01:14:07 -04:00
Simon Cruanes
b080c962e1
test: use less aggressive numbers, my poor ram! 2023-07-10 00:01:37 -04:00
Simon Cruanes
ab675797fa
fix test 2023-07-09 22:35:39 -04:00
Simon Cruanes
e73c1a2966
tests: compat 4.x 2023-07-09 22:07:44 -04:00
Simon Cruanes
427c462778
compat 4.x 2023-07-09 22:05:42 -04:00
Simon Cruanes
68e744290b
add Chan.pop_await 2023-07-09 19:49:27 -04:00
Simon Cruanes
12df71c3ab
expose Suspend_ and its internal effect with an instability alert 2023-07-09 19:49:27 -04:00
Simon Cruanes
e855c99bec
update doc for Fut 2023-07-09 19:49:27 -04:00
Simon Cruanes
39525af7ac
add documentation; expose Runner.For_runner_implementors.Suspend_ 2023-07-09 19:49:26 -04:00
Simon Cruanes
1d57ae8fbb
remove dead code 2023-07-09 19:49:26 -04:00
Simon Cruanes
76ca0f2d88
feat: add Runner, change Pool to produce a Runner.t
futures, chans, etc. should use the `Runner.t` abstraction
and not depend on the exact pool implementation to run tasks.

For now `Pool.create` is the only implementation of a runner, but now
it's possible to implement alternatives.
2023-07-09 19:49:25 -04:00
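A tiny sketch of depending only on the abstraction, assuming `Moonpool.Runner.run_async : Runner.t -> (unit -> unit) -> unit`: a helper written against `Runner.t` works with any runner implementation, not just the pool.

```ocaml
let run_twice (on : Moonpool.Runner.t) (f : unit -> unit) : unit =
  Moonpool.Runner.run_async on f;
  Moonpool.Runner.run_async on f
```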
Simon Cruanes
30d2560a27
add π-computing benchmark 2023-07-09 19:49:25 -04:00
Simon Cruanes
2852741360
feat: add a Lock module 2023-07-09 19:49:25 -04:00
Simon Cruanes
b07d460b3f
port cpp.ml from containers, replace previous codegen with it
now OCaml 5-only features are truly available only on OCaml 5, instead
of just relying on the user reading the docstring.
2023-07-09 19:49:24 -04:00
Simon Cruanes
0780dcf703
fix warning 2023-07-06 09:46:33 -04:00
Simon Cruanes
2acf4b28eb feat: actually support domain-local-await if installed 2023-07-05 22:19:18 -04:00
Simon Cruanes
a266a42628 support for domain-local-await when installed 2023-07-05 22:19:18 -04:00
Simon Cruanes
27ec0f85e6 test: add more fork-join tests 2023-07-05 21:55:00 -04:00
Simon Cruanes
f46cc4f12c fix: Fork_join.both_ignore now has a more general type 2023-07-05 21:55:00 -04:00
Simon Cruanes
b346fa03af rename tests 2023-07-05 21:55:00 -04:00
Simon Cruanes
2db2bd9ba2 rename test dir 2023-07-05 21:55:00 -04:00
Simon Cruanes
bf43d9648e readme: try to make the random array deterministic 2023-07-05 21:55:00 -04:00
Simon Cruanes
4294e90929 readme: add a section on fork-join 2023-07-05 21:55:00 -04:00
Simon Cruanes
0f383e877a test: add a parallel quicksort 2023-07-05 21:55:00 -04:00
Simon Cruanes
03212e7478 fork-join: add both_ignore 2023-07-05 21:55:00 -04:00
Simon Cruanes
43487ebe49 add Fork_join.all_{list,init}
primitives to fork-join over n tasks
2023-07-05 21:55:00 -04:00
Simon Cruanes
45838d9607 fix: in fork-join, start sub-tasks within a handler 2023-07-05 21:55:00 -04:00
Simon Cruanes
3b9f56a138 add test for fork_join 2023-07-05 21:55:00 -04:00
Simon Cruanes
309424a58f move fork join into its own module 2023-07-05 21:55:00 -04:00
Simon Cruanes
009855ce0d feat: add Pool.fork_join 2023-07-05 21:55:00 -04:00
Simon Cruanes
2bbb4185a4
readme 2023-06-24 15:53:05 -04:00
Simon Cruanes
456a31c939
dune 2023-06-24 15:06:41 -04:00
Simon Cruanes
9f04d254af
feat: Pool.run_wait_block can return a value now 2023-06-24 15:06:40 -04:00
Simon Cruanes
e4b159c695
fut: fix warnings 2023-06-24 15:05:56 -04:00
Simon Cruanes
ab718b22f9
add Pool.run_wait_block; rename Pool.run into Pool.run_async 2023-06-24 14:50:43 -04:00
Simon Cruanes
ffd37642b9
Merge pull request #4 from c-cube/wip-await
await
2023-06-20 22:59:08 -04:00
Simon Cruanes
2b6387e388
mdx for 5.0 only 2023-06-20 22:44:06 -04:00
Simon Cruanes
76ec06942a
doc: add more to the readme 2023-06-20 22:44:05 -04:00
Simon Cruanes
3676d1e28d
test: remove dep 2023-06-20 22:43:58 -04:00
Simon Cruanes
1d23d2d7a1
rename Fut.await_exn to Fut.await 2023-06-20 22:43:33 -04:00
Simon Cruanes
68d3487ca8
CI 2023-06-20 22:43:15 -04:00
Simon Cruanes
4fd6154b56
refactor suspend
do not wrap each step in a new handler; using Effect.Deep we only wrap
the entrypoint of the task, and subsequent `continue`s get scheduled
as-is.
2023-06-20 22:43:15 -04:00
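A hedged sketch of this scheme (OCaml 5 only, illustrative names rather than Moonpool's internals): a single deep handler is installed at the task's entrypoint; a `Suspend` effect hands the continuation to the suspender, and waking it up schedules the continuation directly, without wrapping it in a new handler.

```ocaml
open Effect.Deep

type _ Effect.t += Suspend : ((unit -> unit) -> unit) -> unit Effect.t

let with_suspend_handler (schedule : (unit -> unit) -> unit)
    (task : unit -> unit) : unit =
  try_with task ()
    {
      effc =
        (fun (type a) (eff : a Effect.t) ->
          match eff with
          | Suspend register ->
            Some
              (fun (k : (a, unit) continuation) ->
                (* give the suspender a wake-up function; calling it
                   schedules the continuation as-is, with no new handler *)
                register (fun () -> schedule (fun () -> continue k ())))
          | _ -> None);
    }

(* a task suspends itself with:
   Effect.perform (Suspend (fun wake -> (* park [wake] somewhere *) wake ())) *)
```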
Simon Cruanes
f98bcf2f08
add test for await 2023-06-20 22:43:14 -04:00
Simon Cruanes
62777e1112
test: add tests for Fut.await 2023-06-20 22:43:14 -04:00
Simon Cruanes
e26029ab90
feat: add Fut.await and Fut.await_exn for OCaml >= 5.0
this uses suspension (based on `Suspend_`) to wait for completion of the
future.
2023-06-20 22:43:13 -04:00
Simon Cruanes
52a04701ed
feat: add Suspend_ module, using effects, on OCaml >= 5.0 2023-06-20 22:43:13 -04:00
Simon Cruanes
eab774813d
test: add load test for chans 2023-06-20 22:43:12 -04:00
Simon Cruanes
25e8dcdbce
add ocaml-coc 2023-06-20 12:33:23 -04:00
Simon Cruanes
1e21157e8a
feat: add Chan again 2023-06-18 22:09:22 -04:00
Simon Cruanes
22ccb76d66
readme 2023-06-16 09:18:29 -04:00
Simon Cruanes
67bc47181c
fix: in blocking queue, pop works on a non-empty closed queue 2023-06-15 22:24:58 -04:00
Simon Cruanes
4e112e1591
add some property tests 2023-06-15 21:34:33 -04:00
Simon Cruanes
7707a7c1ce
prepare for 0.2 2023-06-15 14:26:28 -04:00
Simon Cruanes
b8588f2d65
feat: add Fut.for_list 2023-06-15 11:59:10 -04:00
Simon Cruanes
d74c6da3fa
fix(fut): fix bug when calling wait_list [] 2023-06-15 11:58:55 -04:00
Simon Cruanes
39712e7f54
test: regression test for bug in wait_list on empty list 2023-06-15 11:58:32 -04:00
Simon Cruanes
d60bf3828b
fix: join_array on arrays of length=1 had a bounds error 2023-06-15 11:42:13 -04:00
Simon Cruanes
c8235f463d
test: add regression test for bug in join_array 2023-06-15 11:42:03 -04:00
Simon Cruanes
fc3d2d2645
add around_task to Pool.create 2023-06-15 11:23:55 -04:00
Simon Cruanes
b451fde853
add Pool.shutdown_without_waiting 2023-06-15 10:52:17 -04:00
Simon Cruanes
059ee8a1d5
add Pool.num_tasks 2023-06-15 10:35:41 -04:00
Simon Cruanes
f76b713556
add Fut.is_done 2023-06-15 10:24:20 -04:00
Simon Cruanes
f18ed688e9
add Blocking_queue.size 2023-06-15 10:20:33 -04:00
Simon Cruanes
debdc8fc31
add Fut.for_array to easily iterate on an array in parallel 2023-06-14 11:38:01 -04:00
Simon Cruanes
826d1f15c8
add test 2023-06-14 11:34:44 -04:00
Simon Cruanes
dd5a177a5f
add Fut.get_or_fail{,_exn} 2023-06-14 11:33:24 -04:00
Simon Cruanes
483392986c
perf: limit number of work queues in pool 2023-06-14 11:21:11 -04:00
Simon Cruanes
1d5e962d87
Merge pull request #2 from c-cube/perf-pool-2023-06-12
perf: improve pool
2023-06-13 22:03:46 -04:00
Simon Cruanes
b084caea69
docs 2023-06-13 13:52:22 -04:00
Simon Cruanes
82af5497f8
pool: do some spinning in run, not in workers 2023-06-13 13:52:22 -04:00
Simon Cruanes
939c6e117c
perf: a few retries in the pool 2023-06-13 13:52:22 -04:00
Simon Cruanes
bde69ba33f
perf: use multiple queues and non-blocking work stealing from them 2023-06-13 13:52:22 -04:00
Simon Cruanes
a9a51b08c4
test t_tree_futs: env J to set size of pool 2023-06-12 23:17:07 -04:00
Simon Cruanes
9a922088fd
test t_tree_futs: run GC stats thread, for trace 2023-06-12 23:17:07 -04:00
Simon Cruanes
df0dae23db
detail in dune 2023-06-12 23:17:07 -04:00
Simon Cruanes
948baea4b2
fix test dune file 2023-06-12 22:22:04 -04:00
Simon Cruanes
392201445b
test: add t_tree_futs
exercises futures pretty hard.
2023-06-12 22:07:36 -04:00
Simon Cruanes
5c462dd688
add trace as test dep 2023-06-12 22:07:19 -04:00
Simon Cruanes
d766b362f1
feat fut: add infix, which builds an INFIX as a first-class module 2023-06-12 22:06:48 -04:00
Simon Cruanes
7c853861d8
feat fut: add join 2023-06-12 22:06:43 -04:00
Simon Cruanes
0177d6aee7
fix fut: actually run all map/bind callbacks if pool is provided 2023-06-12 22:06:18 -04:00
352 changed files with 14382 additions and 1059 deletions

.github/workflows/gh-pages.yml (new file, +35 lines)

@@ -0,0 +1,35 @@
name: github pages
on:
  push:
    branches:
      - main # Set a branch name to trigger deployment
jobs:
  deploy:
    name: Deploy doc
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@main
      - name: Use OCaml
        uses: ocaml/setup-ocaml@v3
        with:
          ocaml-compiler: '5.3'
          dune-cache: true
          allow-prerelease-opam: true
      # temporary until it's in a release
      - run: opam pin picos 0.6.0 -y -n
      - run: opam install odig moonpool moonpool-lwt -t
      - run: opam exec -- odig odoc --cache-dir=_doc/ moonpool moonpool-lwt
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./_doc/html
          destination_dir: .
          enable_jekyll: false

Build and Test workflow

@@ -8,31 +8,82 @@ on:
jobs:
run:
name: build
name: build # build+test on various versions of OCaml, on linux
timeout-minutes: 15
strategy:
fail-fast: true
matrix:
os:
- ubuntu-latest
#- macos-latest
#- windows-latest
ocaml-compiler:
- '4.08'
- '4.14'
- '5.0'
- '5.3'
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@main
- name: Use OCaml ${{ matrix.ocaml-compiler }}
uses: ocaml/setup-ocaml@v2
uses: ocaml/setup-ocaml@v3
with:
ocaml-compiler: ${{ matrix.ocaml-compiler }}
dune-cache: true
allow-prerelease-opam: true
- run: opam install -t moonpool --deps-only
- run: opam pin picos 0.6.0 -y -n
- run: opam exec -- dune build '@install'
- run: opam install -t moonpool moonpool-lwt --deps-only
- run: opam exec -- dune build @install
- run: opam exec -- dune runtest
# install some depopts
- run: opam install thread-local-storage trace hmap
- run: opam exec -- dune build --profile=release --force @install @runtest
compat:
name: build-compat # compat with other OSes
timeout-minutes: 15
strategy:
fail-fast: true
matrix:
os:
- macos-latest
#- windows-latest
ocaml-compiler:
- '5.2'
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@main
- name: Use OCaml ${{ matrix.ocaml-compiler }}
uses: ocaml/setup-ocaml@v3
with:
ocaml-compiler: ${{ matrix.ocaml-compiler }}
dune-cache: true
allow-prerelease-opam: true
# temporary until it's in a release
- run: opam pin https://github.com/ocaml-multicore/picos.git -y -n
- run: opam install -t moonpool moonpool-lwt --deps-only
- run: opam exec -- dune build @install
# install some depopts
- run: opam install thread-local-storage trace domain-local-await
- run: opam exec -- dune build --profile=release --force @install @runtest
format:
name: format
strategy:
matrix:
ocaml-compiler:
- '5.3'
runs-on: 'ubuntu-latest'
steps:
- uses: actions/checkout@main
- name: Use OCaml ${{ matrix.ocaml-compiler }}
uses: ocaml/setup-ocaml@v3
with:
ocaml-compiler: ${{ matrix.ocaml-compiler }}
dune-cache: true
allow-prerelease-opam: true
- run: opam install ocamlformat.0.27.0
- run: opam exec -- make format-check

.gitignore (+1 line)

@@ -1,2 +1,3 @@
_build
_opam
*.tmp

.ocamlformat

@@ -1,4 +1,4 @@
-version = 0.24.1
+version = 0.27.0
profile=conventional
margin=80
if-then-else=k-r

CHANGELOG.md

@@ -1,4 +1,177 @@
# 0.10
- breaking: remove `around_task` from schedulers
- breaking: remove `moonpool.fib` entirely. Please use `picos_std.structured`
if you really need structured concurrency.
- remove deprecated moonpool-io and moonpool.sync
- feat core: add `Main`, salvaged from moonpool.fib
- block signals in background threads
- refactor `chan`; fix bug in `Chan.try_push`
- fix: make `Moonpool_lwt.fut_of_lwt` idempotent
# 0.9
- breaking: require OCaml 5
* no further need for a preprocessor
* forkjoin no longer optional
- moonpool-lwt: large changes, including a Runner that runs
inside `Lwt_unix`'s event loop and can thus use any `_ Lwt.t` function
- remove bounded_queue
- fix core: better repropagating of errors
- add `Fut.{cancel,try_cancel}`
- perf: `await` on immediately ready timer queues its task
- feat: add `Moonpool.yield`
- deprecate moonpool.sync
- deprecate moonpool_io
# 0.8
- api(fut): make alias `'a Fut.t = 'a Picos.Computation.t` public
- feat: add `Fut.make_promise`, have `'a promise = private 'a fut`
- feat(exn_bt): in show/pp, do print the backtrace when present
- feat: block signals in workers if asked to
- relax bound on picos to 0.5-0.6
- feat fib: `spawn_ignore` now has `?on` optional param
- change Moonpool.Chan so it's bounded (still experimental)
- fix task local storage: type was too specific
- fix fiber: use a single fut/computation in fibers
# 0.7
- add `Moonpool_fiber.spawn_top_ignore`
- add `moonpool-io`, based on `picos_io` (still very experimental)
- move to picos as the foundation layer for concurrency primitives (#30)
- move to `thread-local-storage` 0.2 with get/set API
# 0.6
- breaking: remove `Immediate_runner` (bug prone and didn't
handle effects). `Moonpool_fib.main` can be used to handle
effects in the main function.
- remove deprecated alias `Moonpool.Pool`
- feat: add structured concurrency sub-library `moonpool.fib` with
fibers. Fibers can use `await` and spawn other fibers that will
be appropriately cancelled when their parent is.
- feat: add `moonpool-lwt` as an experimental bridge between moonpool and lwt.
This allows moonpool runners to be used from within Lwt to
perform background computations, and conversely to call Lwt from
moonpool with some precautions.
- feat: task-local storage in the main moonpool runners, available from
fibers and regular tasks.
- feat: add `Exn_bt` to core
- feat: add `Runner.dummy`
- make `moonpool.forkjoin` optional (only on OCaml >= 5.0)
- feat: add `Fut.Advanced.barrier_on_abstract_container_of_futures`
- feat: add `Fut.map_list`
- refactor: split off domain pool to `moonpool.dpool`
- fix too early exit in Ws_pool
# 0.5.1
- fix `Ws_pool`: workers would exit before processing
all remaining tasks upon shutdown
# 0.5
## features
- add `Bb_queue.transfer`
- add `Bb_queue.to_{iter,gen,seq}`
- add `Fifo_pool`, a simple pool with a single blocking queue for
workloads with coarse granularity tasks that value
latency (e.g. a web server)
- add a work-stealing pool for heavy compute workloads that
feature a lot of await/fork-join, with a lot of help
from Vesa Karvonen (@polytypic)
- add `Fut.spawn_on_current_runner`
- add `Runner.{spawn_on_current_runner, await}`
- add a few more toplevel aliases in `Moonpool` itself
- add `No_runner`: a runner that runs tasks synchronously in the caller
- on shutdown, pools will finish running all present tasks before
closing. New tasks are immediately rejected.
- use an optional dependency on `thread-local-storage` to
implement work stealing and `spawn_on_current_runner`
## optimizations
- use the main domain to spawn threads on it. This means we can really
use all cores, not all but one.
- in `Fork_join.both`, only one of the two sides schedules a task,
the other runs in the current thread. This reduces scheduling overhead.
- compare to domainslib in benchmarks. With the WS pool we're now slightly
ahead in terms of overhead on the recursive fib benchmark.
## breaking
- deprecate `Pool`, now an alias to `Fifo_pool`
- the `Fut.Infix_local` and `Fut.infix` are gone, replaced with
a simpler `Fut.Infix` module that tries to use the current runner
for intermediate tasks.
# 0.4
- add `Fut.{reify_error,bind_reify_error}`
- full lifecycle for worker domains, where a domain
will shutdown if no thread runs on it, after a
short delay.
- fix: generalize type of `create_arg`
- perf: in `Bb_queue`, only signal condition on push if queue was empty
# 0.3
- add `Fork_join` for parallelizing computations. This is only
available on OCaml 5.x because it relies on effects.
- add `Fork_join.{for_,map_array,map_list}`
- add `Fork_join.all_{list,init}`
- add `Pool.with_`
- add a channel module
- add `Runner`, change `Pool` to produce a `Runner.t`
- add a `Lock` module
- add support for domain-local-await when installed
- add `Fut.await` for OCaml >= 5.0
- fix: Fork_join.both_ignore now has a more general type
- expose `Suspend_` and its internal effect with an instability alert.
This is intended for implementors of `Runner` only.
- port `cpp.ml` from containers, replace previous codegen with it.
This will provide better flexibility for supporting multiple versions
of OCaml in the future.
- add `Pool.run_wait_block`; rename `Pool.run` into `Pool.run_async`
- fix: in blocking queue, `pop` works on a non-empty closed queue
# 0.2
- add `Fut.for_list`
- add `around_task` to `Pool.create`
- add `Pool.shutdown_without_waiting`
- add `Pool.num_tasks`
- add `Fut.is_done`
- add `Blocking_queue.size`
- add `Fut.for_array` to easily iterate on an array in parallel
- add `Fut.get_or_fail{,_exn}`
- perf: limit number of work queues in pool
- perf: use multiple queues and non-blocking work-stealing from them, in pool
this improves the behavior for many small tasks by reducing contention on
each queue
- fix: fut: actually run all map/bind callbacks in pool if provided
# 0.1.1
- fix(fut): fix bug when calling `wait_list []`
- fix: join_array on arrays of length=1 had a bound error
# 0.1
initial release

9
CODE_OF_CONDUCT.md Normal file

@ -0,0 +1,9 @@
# Code of Conduct
This project has adopted the [OCaml Code of Conduct](https://github.com/ocaml/code-of-conduct/blob/main/CODE_OF_CONDUCT.md).
# Enforcement
This project follows the OCaml Code of Conduct
[enforcement policy](https://github.com/ocaml/code-of-conduct/blob/main/CODE_OF_CONDUCT.md#enforcement).
To report any violations, please contact @c-cube

Makefile

@ -9,9 +9,21 @@ clean:
test:
@dune runtest $(DUNE_OPTS)
test-autopromote:
@dune runtest $(DUNE_OPTS) --auto-promote
doc:
@dune build $(DUNE_OPTS) @doc
build-dev:
dune build @install @runtest $(DUNE_OPTS) --workspace=dune-workspace.dev
format:
@dune build $(DUNE_OPTS) @fmt --auto-promote
format-check:
@dune build $(DUNE_OPTS) @fmt --display=quiet
WATCH?= @check @runtest
watch:
dune build $(DUNE_OPTS) -w $(WATCH)
@ -19,13 +31,54 @@ watch:
DUNE_OPTS_BENCH?=--profile=release
N?=40
NITER?=2
BENCH_PSIZE?=1,4,8,20
BENCH_KIND?=fifo,pool
BENCH_CUTOFF?=20
bench-fib:
@echo running for N=$(N)
dune build $(DUNE_OPTS_BENCH) benchs/fib_rec.exe
hyperfine --warmup=1 \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -seq' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -dl' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=20 -kind=pool -fj' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=20 -kind=pool -await' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=4 -kind=fifo' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=4 -kind=pool' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=8 -kind=fifo' \
'./_build/default/benchs/fib_rec.exe -n $(N) -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize=16 -kind=pool'
#hyperfine -L psize $(BENCH_PSIZE) -L kind $(BENCH_KIND) --warmup=1 \
# './_build/default/benchs/fib_rec.exe -cutoff $(BENCH_CUTOFF) -niter $(NITER) -psize={psize} -kind={kind} -n $(N)'
#'./_build/default/benchs/fib_rec.exe -seq -cutoff $(BENCH_CUTOFF) -niter $(NITER) -n $(N)' \
#'./_build/default/benchs/fib_rec.exe -dl -cutoff $(BENCH_CUTOFF) -niter $(NITER) -n $(N)' \
PI_NSTEPS?=100_000_000
PI_MODES?=seq,par1,forkjoin
PI_KIND?=fifo,pool
bench-pi:
@echo running for N=$(PI_NSTEPS)
dune build $(DUNE_OPTS_BENCH) benchs/pi.exe
hyperfine --warmup=1 \
'./_build/default/benchs/pi.exe -n $(PI_NSTEPS) -mode=seq' \
'./_build/default/benchs/pi.exe -n $(PI_NSTEPS) -j 8 -mode par1 -kind=pool' \
'./_build/default/benchs/pi.exe -n $(PI_NSTEPS) -j 8 -mode par1 -kind=fifo' \
'./_build/default/benchs/pi.exe -n $(PI_NSTEPS) -j 16 -mode forkjoin -kind=pool' \
'./_build/default/benchs/pi.exe -n $(PI_NSTEPS) -j 20 -mode forkjoin -kind=pool'
bench-repro-41:
dune build $(DUNE_OPTS_BENCH) examples/repro_41/run.exe
hyperfine --warmup=1 \
"./_build/default/examples/repro_41/run.exe 4 domainslib" \
"./_build/default/examples/repro_41/run.exe 4 moonpool" \
"./_build/default/examples/repro_41/run.exe 5 moonpool" \
"./_build/default/examples/repro_41/run.exe 5 seq"
.PHONY: test clean bench-fib bench-pi
VERSION=$(shell awk '/^version:/ {print $$2}' moonpool.opam)
update_next_tag:
@echo "update version to $(VERSION)..."
sed -i "s/NEXT_VERSION/$(VERSION)/g" $(wildcard src/**/*.ml) $(wildcard src/**/*.mli)
sed -i "s/NEXT_RELEASE/$(VERSION)/g" $(wildcard src/*.ml) $(wildcard src/**/*.ml) $(wildcard src/*.mli) $(wildcard src/**/*.mli)

284
README.md

@ -10,19 +10,102 @@ The rationale is that we should not have more domains than cores, so
it's easier to pre-allocate exactly that many domains, and run more flexible
thread pools on top.
In addition, some concurrency and parallelism primitives are provided:
- `Moonpool.Fut` provides futures/promises that execute
on these thread pools. The futures are thread safe.
- `Moonpool.Chan` provides simple cooperative and thread-safe channels
to use within pool-bound tasks. They're essentially re-usable futures (a short sketch follows below).
- `Moonpool_forkjoin`, in the library `moonpool.forkjoin`,
provides the fork-join parallelism primitives
to use within tasks running in the pool.
Moonpool now requires OCaml 5 (meaning there are actual domains and effects, not just threads),
so the `Fut.await` primitive is always provided. It's simpler and more powerful
than the monadic combinators.
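As a quick taste of the channel API (a minimal sketch, not part of the original README; it only uses `Chan.create`, `push`, `pop` and `close`, whose signatures appear in `src/core/chan.mli` further down this diff):
```ocaml
(* a producer task pushes numbers, a consumer task sums them;
   [push] suspends when the channel is full, [pop] when it is empty *)
let () =
  let module M = Moonpool in
  M.Ws_pool.with_ () @@ fun pool ->
  let chan : int M.Chan.t = M.Chan.create ~max_size:4 () in
  let producer =
    M.Fut.spawn ~on:pool (fun () ->
        for i = 1 to 10 do
          M.Chan.push chan i
        done;
        M.Chan.close chan)
  in
  let consumer =
    M.Fut.spawn ~on:pool (fun () ->
        let sum = ref 0 in
        (try
           while true do
             sum := !sum + M.Chan.pop chan
           done
         with M.Chan.Closed -> ());
        !sum)
  in
  M.Fut.wait_block_exn producer;
  assert (M.Fut.wait_block_exn consumer = 55)
```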
## Usage
The user can create several thread pools (implementing the interface `Runner.t`).
These pools use regular posix threads, but the threads are spread across
multiple domains (on OCaml 5), which enables parallelism.
Currently we provide these pool implementations:
- `Fifo_pool` is a thread pool that uses a blocking queue to schedule tasks,
which means they're picked in the same order they've been scheduled ("fifo").
This pool is simple and will behave fine for coarse-granularity concurrency,
but will slow down under heavy contention.
- `Ws_pool` is a work-stealing pool, where each thread has its own local queue
in addition to a global queue of tasks. This is efficient for workloads
with many short tasks that spawn other tasks, but the order in which
tasks are run is less predictable. This is useful when throughput is
the important thing to optimize.
The function `Runner.run_async pool task` schedules `task()` to run on one of
the workers of `pool`, as soon as one is available. No result is returned by `run_async`.
```ocaml
# #require "threads";;
# let pool = Moonpool.Fifo_pool.create ~num_threads:4 ();;
val pool : Moonpool.Runner.t = <abstr>
# begin
Moonpool.Runner.run_async pool
(fun () ->
Thread.delay 0.1;
print_endline "running from the pool");
print_endline "running from the caller";
Thread.delay 0.3; (* wait for task to run before returning *)
end ;;
running from the caller
running from the pool
- : unit = ()
```
To wait until the task is done, you can use `Runner.run_wait_block`[^1] instead:
[^1]: beware of deadlock! See documentation for more details.
```ocaml
# begin
Moonpool.Runner.run_wait_block pool
(fun () ->
Thread.delay 0.1;
print_endline "running from the pool");
print_endline "running from the caller (after waiting)";
end ;;
running from the pool
running from the caller (after waiting)
- : unit = ()
```
The function `Fut.spawn ~on f` schedules `f ()` on the pool `on`, and immediately
returns a _future_ which will eventually hold the result (or an exception).
The function `Fut.peek` will return the current value, or `None` if the future is
still not completed.
The functions `Fut.wait_block` and `Fut.wait_block_exn` will
block the current thread and wait for the future to complete.
There are some deadlock risks associated with careless use of these, so
be sure to consult the documentation of the `Fut` module.
```ocaml
# let fut = Moonpool.Fut.spawn ~on:pool
(fun () ->
Thread.delay 0.5;
1+1);;
val fut : int Moonpool.Fut.t = <abstr>
# Moonpool.Fut.peek fut;;
- : int Moonpool.Fut.or_error option = None
# Moonpool.Fut.wait_block_exn fut;;
- : int = 2
```
Some combinators on futures are also provided, e.g. to wait for all futures in
an array to complete:
```ocaml
# let rec fib x =
if x <= 1 then 1 else fib (x-1) + fib (x-2);;
val fib : int -> int = <fun>
@ -46,20 +129,182 @@ Ok
514229; 832040; 1346269; 2178309; 3524578; 5702887; 9227465|]
```
### Support for `await`
On OCaml 5, effect handlers can be used to implement `Fut.await : 'a Fut.t -> 'a`.
The expression `Fut.await some_fut`, when run from inside some thread pool,
suspends its caller task; the suspended task is then parked, and will
be resumed when the future is completed.
The pool worker that was executing this expression, in the meantime, moves
on to another task.
This means that `await` is free of the deadlock risks associated with
`Fut.wait_block`.
In the following example, we bypass the need for `Fut.join_array` by simply
using regular array functions along with `Fut.await`.
```ocaml
# let main_fut =
let open Moonpool.Fut in
spawn ~on:pool @@ fun () ->
(* array of sub-futures *)
let tasks: _ Moonpool.Fut.t array = Array.init 100 (fun i ->
spawn ~on:pool (fun () ->
Thread.delay 0.01;
i+1))
in
Array.fold_left (fun n fut -> n + await fut) 0 tasks
;;
val main_fut : int Moonpool.Fut.t = <abstr>
# let expected_sum = Array.init 100 (fun i->i+1) |> Array.fold_left (+) 0;;
val expected_sum : int = 5050
# assert (expected_sum = Moonpool.Fut.wait_block_exn main_fut);;
- : unit = ()
```
### Errors
We have an `Exn_bt.t` type that comes in handy in many places. It bundles together
an exception and the backtrace associated with the place the exception was caught.
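For instance (a small sketch; it assumes the module is reachable as `Moonpool.Exn_bt`, using only the `get`, `show` and `result` entries shown in `src/core/exn_bt.mli` below):
```ocaml
(* capture an exception together with the backtrace of the point
   where it was caught, then print both *)
let safe_parse (s : string) : int Moonpool.Exn_bt.result =
  try Ok (int_of_string s)
  with exn -> Error (Moonpool.Exn_bt.get exn)

let () =
  match safe_parse "not-a-number" with
  | Ok n -> Printf.printf "parsed %d\n" n
  | Error ebt -> print_endline (Moonpool.Exn_bt.show ebt)
```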
### Local storage
Moonpool, via picos, provides _task local storage_ (like thread-local storage, but per task).
The storage is convenient for carrying around context for cross-cutting concerns such
as logging or tracing (e.g. a log tag for the current user or request ID, or a tracing
scope).
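A hypothetical sketch of how such a tag can be carried around (the module name `Task_local_storage` and the `create`/`get`/`set` signatures are assumptions for illustration, not taken from this diff; check the actual interface):
```ocaml
(* hypothetical API: a storage key created once, set by each task,
   and readable from any function that task calls *)
let request_id : string Moonpool.Task_local_storage.t =
  Moonpool.Task_local_storage.create ~init:(fun () -> "<none>") ()

let log msg =
  Printf.printf "[req %s] %s\n%!"
    (Moonpool.Task_local_storage.get request_id)
    msg

let handle ~pool (req : string) : unit Moonpool.Fut.t =
  Moonpool.Fut.spawn ~on:pool (fun () ->
      Moonpool.Task_local_storage.set request_id req;
      log "handling request")
```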
### Fork-join
The sub-library `moonpool.forkjoin` provides a module `Moonpool_forkjoin`
that implements the [fork-join model](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model).
It must run on a pool (using `Runner.run_async` or inside a future via `Fut.spawn`).
It is generally better to use the work-stealing pool for workloads that rely on
fork-join for better performance, because fork-join will tend to spawn lots of
shorter tasks.
Here is a simple example of a parallel sort.
It uses selection sort for small slices, like this:
```ocaml
# let rec select_sort arr i len =
if len >= 2 then (
let idx = ref i in
for j = i+1 to i+len-1 do
if arr.(j) < arr.(!idx) then idx := j
done;
let tmp = arr.(!idx) in
arr.(!idx) <- arr.(i);
arr.(i) <- tmp;
select_sort arr (i+1) (len-1)
);;
val select_sort : 'a array -> int -> int -> unit = <fun>
```
And a parallel quicksort for larger slices:
```ocaml
# let rec quicksort arr i len : unit =
if len <= 10 then select_sort arr i len
else (
let pivot = arr.(i + (len / 2)) in
let low = ref (i - 1) in
let high = ref (i + len) in
(* partition the array slice *)
while !low < !high do
incr low;
decr high;
while arr.(!low) < pivot do
incr low
done;
while arr.(!high) > pivot do
decr high
done;
if !low < !high then (
let tmp = arr.(!low) in
arr.(!low) <- arr.(!high);
arr.(!high) <- tmp
)
done;
(* sort lower half and upper half in parallel *)
Moonpool_forkjoin.both_ignore
(fun () -> quicksort arr i (!low - i))
(fun () -> quicksort arr !low (len - (!low - i)))
);;
val quicksort : 'a array -> int -> int -> unit = <fun>
# let arr = [| 4;2;1;5;1;10;3 |];;
val arr : int array = [|4; 2; 1; 5; 1; 10; 3|]
# Moonpool.Fut.spawn
~on:pool (fun () -> quicksort arr 0 (Array.length arr))
|> Moonpool.Fut.wait_block_exn;;
- : unit = ()
# arr;;
- : int array = [|1; 1; 2; 3; 4; 5; 10|]
# let arr =
let rand = Random.State.make [| 42 |] in
Array.init 40 (fun _-> Random.State.int rand 300);;
val arr : int array =
[|64; 220; 247; 196; 51; 186; 22; 106; 58; 58; 11; 161; 243; 111; 74; 109;
49; 135; 59; 192; 132; 38; 19; 44; 126; 147; 182; 83; 95; 231; 204; 121;
142; 255; 72; 85; 95; 93; 73; 202|]
# Moonpool.Fut.spawn ~on:pool
(fun () -> quicksort arr 0 (Array.length arr))
|> Moonpool.Fut.wait_block_exn
;;
- : unit = ()
# arr;;
- : int array =
[|11; 19; 22; 38; 44; 49; 51; 58; 58; 59; 64; 72; 73; 74; 83; 85; 93; 95; 95;
106; 109; 111; 121; 126; 132; 135; 142; 147; 161; 182; 186; 192; 196; 202;
204; 220; 231; 243; 247; 255|]
```
Note that the sort had to be started in a task (via `Moonpool.Fut.spawn`)
so that fork-join would run on the thread pool.
This is necessary even for the initial iteration because fork-join
relies on OCaml 5's effects, meaning that the computation needs to run
inside an effect handler provided by the thread pool.
### More intuition
To quote [gasche](https://discuss.ocaml.org/t/ann-moonpool-0-1/12387/15):
<blockquote>
You are assuming that, if pool P1 has 5000 tasks, and pool P2 has 10 other tasks, then these 10 tasks will get to run faster than if we just added them at the end of pool P1. This sounds like a “fairness” assumption: separate pools will get comparable shares of domain compute resources, or at least no pool will be delayed too much from running their first tasks.
[…]
- each pool uses a fixed number of threads, all running simultaneously; if there are more tasks sent to the pool, they are delayed and will only get one of the pool threads when previous tasks have finished
- separate pools run their separate threads simultaneously, so they compete for compute resources on their domain using OCaml's systhreads scheduler which does provide fairness in practice
- as a result, running in a new pool enables quicker completion than adding to an existing pool (as we will be scheduled right away instead of waiting for previous tasks in our pool to free some threads)
- the ratio of compute resources that each pool gets should be roughly proportional to its number of worker threads
</blockquote>
## OCaml versions
This works for OCaml >= 5.00.
Internally, there is a fixed pool of domains (using the recommended domain count).
These domains do not do much by themselves, but we schedule new threads on them, and form pools
of threads that contain threads from each domain.
Each domain might thus have multiple threads that belong to distinct pools (and several threads from
the same pool, too — this is useful for threads blocking on IO); each pool will have threads
running on distinct domains, which enables parallelism.
A useful analogy is that each domain is a bit like a CPU core, and `Thread.t` is a logical thread running on a core.
Multiple threads have to share a single core and do not run in parallel on it[^2].
We can therefore build pools that spread their worker threads on multiple cores to enable parallelism within each pool.
TODO: actually use https://github.com/haesbaert/ocaml-processor to pin domains to cores,
possibly optionally using `select` in dune.
@ -74,4 +319,5 @@ MIT license.
$ opam install moonpool
```
[^2]: ignoring hyperthreading for the sake of the analogy.

3
bench_fib.sh Executable file

@ -0,0 +1,3 @@
#!/bin/sh
OPTS="--profile=release --display=quiet"
exec dune exec $OPTS -- benchs/fib_rec.exe "$@"

3
bench_pi.sh Executable file

@ -0,0 +1,3 @@
#!/bin/sh
OPTS="--profile=release --display=quiet"
exec dune exec $OPTS -- benchs/pi.exe "$@"

3
bench_primes.sh Executable file

@ -0,0 +1,3 @@
#!/bin/sh
OPTS="--profile=release --display=quiet"
exec dune exec $OPTS -- benchs/primes.exe "$@"

benchs/dune

@ -1,4 +1,3 @@
(executables
(names fib_rec pi primes)
(libraries moonpool moonpool.forkjoin unix trace trace-tef domainslib))

benchs/fib_rec.ml

@ -1,4 +1,8 @@
open Moonpool
module Trace = Trace_core
module FJ = Moonpool_forkjoin
let ( let@ ) = ( @@ )
let rec fib_direct x =
if x <= 1 then
@ -12,42 +16,129 @@ let rec fib ~on x : int Fut.t =
if x <= !cutoff then
Fut.spawn ~on (fun () -> fib_direct x)
else
let open Fut.Infix in
let+ t1 = fib ~on (x - 1) and+ t2 = fib ~on (x - 2) in
t1 + t2
let fib_fj ~on x : int Fut.t =
let rec fib_rec x : int =
if x <= !cutoff then
fib_direct x
else (
let n1, n2 =
FJ.both (fun () -> fib_rec (x - 1)) (fun () -> fib_rec (x - 2))
in
n1 + n2
)
in
Fut.spawn ~on (fun () -> fib_rec x)
let fib_await ~on x : int Fut.t =
let rec fib_rec x : int Fut.t =
if x <= !cutoff then
Fut.spawn ~on (fun () -> fib_direct x)
else
Fut.spawn ~on (fun () ->
let n1 = fib_rec (x - 1) in
let n2 = fib_rec (x - 2) in
let n1 = Fut.await n1 in
let n2 = Fut.await n2 in
n1 + n2)
in
fib_rec x
let rec fib_dl ~pool x : int Domainslib.Task.promise =
if x <= !cutoff then
Domainslib.Task.async pool (fun () -> fib_direct x)
else
Domainslib.Task.async pool (fun () ->
let t1 = fib_dl ~pool (x - 1) and t2 = fib_dl ~pool (x - 2) in
let t1 = Domainslib.Task.await pool t1 in
let t2 = Domainslib.Task.await pool t2 in
t1 + t2)
let () = assert (List.init 10 fib_direct = [ 1; 1; 2; 3; 5; 8; 13; 21; 34; 55 ])
let create_pool ~psize ~kind () =
match kind with
| "fifo" -> Fifo_pool.create ?num_threads:psize ()
| "pool" -> Ws_pool.create ?num_threads:psize ()
| _ -> assert false
let str_of_int_opt = function
| None -> "None"
| Some i -> Printf.sprintf "Some %d" i
let run ~psize ~n ~seq ~dl ~fj ~await ~niter ~kind () : unit =
let@ _sp = Trace.with_span ~__FILE__ ~__LINE__ "fib.run" in
let pool = lazy (create_pool ~kind ~psize ()) in
let dl_pool =
lazy
(let n = Domain.recommended_domain_count () in
Printf.printf "use %d domains\n%!" n;
Domainslib.Task.setup_pool ~num_domains:n ())
in
for _i = 1 to niter do
let res =
if seq then (
Printf.printf "compute fib %d sequentially\n%!" n;
fib_direct n
) else if dl then (
Printf.printf "compute fib %d with domainslib\n%!" n;
let (lazy pool) = dl_pool in
Domainslib.Task.run pool (fun () ->
Domainslib.Task.await pool @@ fib_dl ~pool n)
) else if fj then (
Printf.printf "compute fib %d using fork-join with pool size=%s\n%!" n
(str_of_int_opt psize);
fib_fj ~on:(Lazy.force pool) n |> Fut.wait_block_exn
) else if await then (
Printf.printf "compute fib %d using await with pool size=%s\n%!" n
(str_of_int_opt psize);
fib_await ~on:(Lazy.force pool) n |> Fut.wait_block_exn
) else (
Printf.printf "compute fib %d with pool size=%d\n%!" n psize;
Printf.printf "compute fib %d with pool size=%s\n%!" n
(str_of_int_opt psize);
fib ~on:(Lazy.force pool) n |> Fut.wait_block_exn
)
in
Printf.printf "fib %d = %d\n%!" n res
done;
if seq then
()
else if dl then
Domainslib.Task.teardown_pool (Lazy.force dl_pool)
else
Ws_pool.shutdown (Lazy.force pool)
let () =
let@ () = Trace_tef.with_setup () in
let n = ref 40 in
let psize = ref None in
let seq = ref false in
let niter = ref 3 in
let kind = ref "pool" in
let dl = ref false in
let await = ref false in
let fj = ref false in
let opts =
[
"-psize", Arg.Set_int psize, " pool size";
"-psize", Arg.Int (fun i -> psize := Some i), " pool size";
"-n", Arg.Set_int n, " fib <n>";
"-seq", Arg.Set seq, " sequential";
"-dl", Arg.Set dl, " domainslib";
"-fj", Arg.Set fj, " fork join";
"-niter", Arg.Set_int niter, " number of iterations";
"-await", Arg.Set await, " use await";
"-cutoff", Arg.Set_int cutoff, " cutoff for sequential computation";
( "-kind",
Arg.Symbol ([ "pool"; "fifo" ], ( := ) kind),
" pick pool implementation" );
]
|> Arg.align
in
Arg.parse opts ignore "";
run ~psize:!psize ~n:!n ~fj:!fj ~seq:!seq ~await:!await ~dl:!dl ~niter:!niter
~kind:!kind ()

146
benchs/pi.ml Normal file

@ -0,0 +1,146 @@
(* compute Pi *)
open Moonpool
module FJ = Moonpool_forkjoin
let ( let@ ) = ( @@ )
let j = ref 0
let spf = Printf.sprintf
let run_sequential (num_steps : int) : float =
let@ _sp = Trace.with_span ~__FILE__ ~__LINE__ "pi.seq" in
let step = 1. /. float num_steps in
let sum = ref 0. in
for i = 0 to num_steps - 1 do
let x = (float i +. 0.5) *. step in
sum := !sum +. (4. /. (1. +. (x *. x)))
done;
let pi = step *. !sum in
pi
(** Create a pool *)
let with_pool ~kind () f =
match kind with
| "pool" ->
if !j = 0 then
Ws_pool.with_ f
else
Ws_pool.with_ ~num_threads:!j f
| "fifo" ->
if !j = 0 then
Fifo_pool.with_ f
else
Fifo_pool.with_ ~num_threads:!j f
| _ -> assert false
(** Run in parallel using {!Fut.for_} *)
let run_par1 ~kind (num_steps : int) : float =
let@ pool = with_pool ~kind () in
let num_tasks = Ws_pool.size pool in
let step = 1. /. float num_steps in
let global_sum = Lock.create 0. in
(* one chunk of the work *)
let run_task _idx_task : unit =
let@ _sp =
Trace.with_span ~__FILE__ ~__LINE__ "pi.slice" ~data:(fun () ->
[ "i", `Int _idx_task ])
in
let sum = ref 0. in
let i = ref 0 in
while !i < num_steps do
let x = (float !i +. 0.5) *. step in
sum := !sum +. (4. /. (1. +. (x *. x)));
(* next iteration *) i := !i + num_tasks
done;
let sum = !sum in
Lock.update global_sum (fun x -> x +. sum)
in
Fut.wait_block_exn @@ Fut.for_ ~on:pool num_tasks run_task;
let pi = step *. Lock.get global_sum in
pi
let run_fork_join ~kind num_steps : float =
let@ pool = with_pool ~kind () in
let num_tasks = Ws_pool.size pool in
let step = 1. /. float num_steps in
let global_sum = Lock.create 0. in
Ws_pool.run_wait_block pool (fun () ->
FJ.for_
~chunk_size:(3 + (num_steps / num_tasks))
num_steps
(fun low high ->
let sum = ref 0. in
for i = low to high do
let x = (float i +. 0.5) *. step in
sum := !sum +. (4. /. (1. +. (x *. x)))
done;
let sum = !sum in
Lock.update global_sum (fun n -> n +. sum)));
let pi = step *. Lock.get global_sum in
pi
type mode =
| Sequential
| Par1
| Fork_join
let () =
let@ () = Trace_tef.with_setup () in
Trace.set_thread_name "main";
let@ _sp = Trace.with_span ~__FILE__ ~__LINE__ "main" in
let mode = ref Sequential in
let n = ref 1000 in
let time = ref false in
let kind = ref "pool" in
let set_mode = function
| "seq" -> mode := Sequential
| "par1" -> mode := Par1
| "forkjoin" -> mode := Fork_join
| _s -> failwith (spf "unknown mode %S" _s)
in
let opts =
[
"-n", Arg.Set_int n, " number of steps";
( "-mode",
Arg.Symbol ([ "seq"; "par1"; "forkjoin" ], set_mode),
" mode of execution" );
"-j", Arg.Set_int j, " number of threads";
"-t", Arg.Set time, " printing timing";
( "-kind",
Arg.Symbol ([ "pool"; "fifo" ], ( := ) kind),
" pick pool implementation" );
]
|> Arg.align
in
Arg.parse opts ignore "";
let t_start = Unix.gettimeofday () in
let res =
match !mode with
| Sequential -> run_sequential !n
| Par1 -> run_par1 ~kind:!kind !n
| Fork_join -> run_fork_join ~kind:!kind !n
in
let elapsed : float = Unix.gettimeofday () -. t_start in
Printf.printf "pi=%.6f (pi=%.6f, diff=%.3f)%s\n%!" res Float.pi
(abs_float (Float.pi -. res))
(if !time then
spf " in %.4fs" elapsed
else
"");
()

60
benchs/primes.ml Normal file

@ -0,0 +1,60 @@
let ( let@ ) = ( @@ )
let spf = Printf.sprintf
let generate' chan =
for i = 2 to Int.max_int do
Moonpool.Chan.push chan i
done
let filter' in_chan out_chan prime =
let rec loop () =
let n = Moonpool.Chan.pop in_chan in
if n mod prime <> 0 then Moonpool.Chan.push out_chan n;
loop ()
in
loop ()
let main ~chan_size ~n ~on_prime () : unit =
let@ runner = Moonpool.Ws_pool.with_ () in
let@ () = Moonpool.Ws_pool.run_wait_block runner in
let primes = ref @@ Moonpool.Chan.create ~max_size:chan_size () in
Moonpool.run_async runner
(let chan = !primes in
fun () -> generate' chan);
for _i = 1 to n do
let prime = Moonpool.Chan.pop !primes in
on_prime prime;
let filtered_chan = Moonpool.Chan.create ~max_size:chan_size () in
Moonpool.run_async runner
(let in_chan = !primes in
fun () -> filter' in_chan filtered_chan prime);
primes := filtered_chan
done
let () =
let n = ref 10_000 in
let chan_size = ref 0 in
let time = ref true in
let opts =
[
"-n", Arg.Set_int n, " number of iterations";
"--no-time", Arg.Clear time, " do not compute time";
"--chan-size", Arg.Set_int chan_size, " channel size";
]
|> Arg.align
in
Arg.parse opts ignore "";
Printf.printf "computing %d primes\n%!" !n;
let t_start = Unix.gettimeofday () in
let n_primes = Atomic.make 0 in
main ~n:!n ~chan_size:!chan_size ~on_prime:(fun _ -> Atomic.incr n_primes) ();
let elapsed : float = Unix.gettimeofday () -. t_start in
Printf.printf "computed %d primes%s\n%!" (Atomic.get n_primes)
(if !time then
spf " in %.4fs" elapsed
else
"")

10
dune

@ -1,5 +1,9 @@
(env
(_
(flags :standard -strict-sequence -warn-error -a+8 -w +a-4-40-42-70)))
(mdx
(libraries moonpool moonpool.forkjoin threads)
(package moonpool)
(enabled_if
(>= %{ocaml_version} 5.0)))

dune-project

@ -1,13 +1,20 @@
(lang dune 3.0)
(using mdx 0.2)
(name moonpool)
(version 0.10)
(generate_opam_files true)
(source
(github c-cube/moonpool))
(authors "Simon Cruanes")
(maintainers "Simon Cruanes")
(license MIT)
;(documentation https://url/to/documentation)
@ -16,15 +23,60 @@
(name moonpool)
(synopsis "Pools of threads supported by a pool of domains")
(depends
(ocaml
(>= 5.0))
dune
(either
(>= 1.0))
(trace :with-test)
(trace-tef :with-test)
(qcheck-core
(and
:with-test
(>= 0.19)))
(thread-local-storage
(and
(>= 0.2)
(< 0.3)))
(odoc :with-doc)
(hmap :with-test)
(picos
(and
(>= 0.5)
(< 0.7)))
(picos_std
(and
(>= 0.5)
(< 0.7)))
(mdx
(and
(>= 1.9.0)
:with-test)))
(depopts
hmap
(trace
(>= 0.6)))
(tags
(thread pool domain futures fork-join)))
(package
(name moonpool-lwt)
(synopsis "Event loop for moonpool based on Lwt-engine (experimental)")
(allow_empty) ; on < 5.0
(depends
(moonpool
(= :version))
(ocaml
(>= 5.0))
(qcheck-core
(and
:with-test
(>= 0.19)))
(hmap :with-test)
lwt
base-unix
(trace :with-test)
(trace-tef :with-test)
(odoc :with-doc)))
; See the complete stanza docs at https://dune.readthedocs.io/en/stable/dune-files.html#dune-project

4
dune-workspace.dev Normal file

@ -0,0 +1,4 @@
(lang dune 3.0)
;(context (opam (switch 4.08.1)))
(context (opam (switch 4.14.0)))
(context (opam (switch 5.0.0)))

27
examples/discuss1.ml Normal file

@ -0,0 +1,27 @@
(** NOTE: this was an example from
https://discuss.ocaml.org/t/confused-about-moonpool-cancellation/15381 but
there is no cancelation anymore :) *)
let ( let@ ) = ( @@ )
let () =
let@ () = Trace_tef.with_setup () in
let@ _ = Moonpool.main in
(* let@ runner = Moonpool.Ws_pool.with_ () in *)
let@ runner = Moonpool.Background_thread.with_ () in
(* Pretend this is some long-running read loop *)
for i = 1 to 10 do
Printf.printf "MAIN LOOP %d\n%!" i;
let _ : _ Moonpool.Fut.t =
Moonpool.Fut.spawn ~on:runner (fun () ->
Printf.printf "RUN FIBER %d\n%!" i;
Format.printf "FIBER %d NOT CANCELLED YET@." i;
failwith "BOOM")
in
Moonpool.Fut.yield ();
(* Thread.delay 0.2; *)
(* Thread.delay 0.0001; *)
()
done

11
examples/dune Normal file

@ -0,0 +1,11 @@
(executables
(names discuss1)
(enabled_if
(>= %{ocaml_version} 5.0))
;(package moonpool)
(libraries
moonpool
trace
trace-tef
;tracy-client.trace
))

5
examples/repro_41/dune Normal file

@ -0,0 +1,5 @@
(executables
(names run)
(enabled_if
(>= %{ocaml_version} 5.0))
(libraries moonpool trace trace-tef domainslib))

54
examples/repro_41/run.ml Normal file

@ -0,0 +1,54 @@
(* fibo.ml *)
let cutoff = 25
let input = 40
let rec fibo_seq n =
if n <= 1 then
n
else
fibo_seq (n - 1) + fibo_seq (n - 2)
let rec fibo_domainslib ctx n =
if n <= cutoff then
fibo_seq n
else
let open Domainslib in
let fut1 = Task.async ctx (fun () -> fibo_domainslib ctx (n - 1)) in
let fut2 = Task.async ctx (fun () -> fibo_domainslib ctx (n - 2)) in
Task.await ctx fut1 + Task.await ctx fut2
let rec fibo_moonpool ctx n =
if n <= cutoff then
fibo_seq n
else
let open Moonpool in
let fut1 = Fut.spawn ~on:ctx (fun () -> fibo_moonpool ctx (n - 1)) in
let fut2 = Fut.spawn ~on:ctx (fun () -> fibo_moonpool ctx (n - 2)) in
Fut.await fut1 + Fut.await fut2
let usage =
"fibo.exe <num_domains> [ domainslib | moonpool | moonpool_fifo | seq ]"
let num_domains = try int_of_string Sys.argv.(1) with _ -> failwith usage
let implem = try Sys.argv.(2) with _ -> failwith usage
let () =
let output =
match implem with
| "moonpool" ->
let open Moonpool in
let ctx = Ws_pool.create ~num_threads:num_domains () in
Ws_pool.run_wait_block ctx (fun () -> fibo_moonpool ctx input)
| "moonpool_fifo" ->
let open Moonpool in
let ctx = Fifo_pool.create ~num_threads:num_domains () in
Ws_pool.run_wait_block ctx (fun () -> fibo_moonpool ctx input)
| "domainslib" ->
let open Domainslib in
let pool = Task.setup_pool ~num_domains () in
Task.run pool (fun () -> fibo_domainslib pool input)
| "seq" -> fibo_seq input
| _ -> failwith usage
in
print_int output;
print_newline ()

36
moonpool-lwt.opam Normal file

@ -0,0 +1,36 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
version: "0.10"
synopsis: "Event loop for moonpool based on Lwt-engine (experimental)"
maintainer: ["Simon Cruanes"]
authors: ["Simon Cruanes"]
license: "MIT"
homepage: "https://github.com/c-cube/moonpool"
bug-reports: "https://github.com/c-cube/moonpool/issues"
depends: [
"dune" {>= "3.0"}
"moonpool" {= version}
"ocaml" {>= "5.0"}
"qcheck-core" {with-test & >= "0.19"}
"hmap" {with-test}
"lwt"
"base-unix"
"trace" {with-test}
"trace-tef" {with-test}
"odoc" {with-doc}
]
build: [
["dune" "subst"] {dev}
[
"dune"
"build"
"-p"
name
"-j"
jobs
"@install"
"@runtest" {with-test}
"@doc" {with-doc}
]
]
dev-repo: "git+https://github.com/c-cube/moonpool.git"

moonpool.opam

@ -1,20 +1,31 @@
# This file is generated by dune, edit dune-project instead
opam-version: "2.0"
version: "0.1"
version: "0.10"
synopsis: "Pools of threads supported by a pool of domains"
maintainer: ["Simon Cruanes"]
authors: ["Simon Cruanes"]
license: "MIT"
tags: ["thread" "pool" "domain"]
tags: ["thread" "pool" "domain" "futures" "fork-join"]
homepage: "https://github.com/c-cube/moonpool"
bug-reports: "https://github.com/c-cube/moonpool/issues"
depends: [
"ocaml" {>= "4.08"}
"ocaml" {>= "5.0"}
"dune" {>= "3.0"}
"either"
"either" {>= "1.0"}
"trace" {with-test}
"trace-tef" {with-test}
"qcheck-core" {with-test & >= "0.19"}
"thread-local-storage" {>= "0.2" & < "0.3"}
"odoc" {with-doc}
"hmap" {with-test}
"picos" {>= "0.5" & < "0.7"}
"picos_std" {>= "0.5" & < "0.7"}
"mdx" {>= "1.9.0" & with-test}
]
depopts: [
"hmap"
"trace" {>= "0.6"}
]
build: [
["dune" "subst"] {dev}
[

src/bb_queue.mli (deleted)

@ -1,32 +0,0 @@
(** Basic Blocking Queue *)
type 'a t
val create : unit -> _ t
exception Closed
val push : 'a t -> 'a -> unit
(** [push q x] pushes [x] into [q], and returns [()].
@raise Closed if [close q] was previously called.*)
val pop : 'a t -> 'a
(** [pop q] pops the next element in [q]. It might block until an element comes.
@raise Closed if the queue was closed before a new element was available. *)
val try_pop : force_lock:bool -> 'a t -> 'a option
(** [try_pop q] immediately pops the first element of [q], if any,
or returns [None] without blocking.
@param force_lock if true, use {!Mutex.lock} (which can block under contention);
if false, use {!Mutex.try_lock}, which might return [None] even in
presence of an element if there's contention *)
val try_push : 'a t -> 'a -> bool
(** [try_push q x] tries to push into [q], in which case
it returns [true]; or it fails to push and returns [false]
without blocking.
@raise Closed if the locking succeeded but the queue is closed.
*)
val close : _ t -> unit
(** Close the queue, meaning there won't be any more [push] allowed. *)

src/core/background_thread.ml Normal file

@ -0,0 +1,20 @@
include Runner
let ( let@ ) = ( @@ )
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?name:string ->
'a
(** Arguments used in {!create}. See {!create} for explanations. *)
let create ?on_init_thread ?on_exit_thread ?on_exn ?name () : t =
Fifo_pool.create ?on_init_thread ?on_exit_thread ?on_exn ?name ~num_threads:1
()
let with_ ?on_init_thread ?on_exit_thread ?on_exn ?name () f =
let pool = create ?on_init_thread ?on_exit_thread ?on_exn ?name () in
let@ () = Fun.protect ~finally:(fun () -> shutdown pool) in
f pool

src/core/background_thread.mli Normal file

@ -0,0 +1,23 @@
(** A simple runner with a single background thread.
Because this is guaranteed to have a single worker thread, tasks scheduled
in this runner always run asynchronously but in a sequential fashion.
This is similar to {!Fifo_pool} with exactly one thread.
@since 0.6 *)
include module type of Runner
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?name:string ->
'a
(** Arguments used in {!create}. See {!create} for explanations. *)
val create : (unit -> t, _) create_args
(** Create the background runner *)
val with_ : (unit -> (t -> 'a) -> 'a, _) create_args
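A possible use of this runner (a sketch, relying only on `with_` above and `Runner.run_async`):
```ocaml
(* all tasks go to the same single worker thread, so they run
   asynchronously but sequentially with respect to one another *)
let () =
  Moonpool.Background_thread.with_ ~name:"bg" () @@ fun runner ->
  for i = 1 to 3 do
    Moonpool.Runner.run_async runner (fun () ->
        Printf.printf "task %d\n%!" i)
  done
(* [with_] shuts the runner down when the scope exits *)
```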

src/core/bb_queue.ml

@ -29,18 +29,21 @@ let push (self : _ t) x : unit =
Mutex.unlock self.mutex;
raise Closed
) else (
let was_empty = Queue.is_empty self.q in
Queue.push x self.q;
if was_empty then Condition.broadcast self.cond;
Mutex.unlock self.mutex
)
let pop (self : 'a t) : 'a =
Mutex.lock self.mutex;
let rec loop () =
if Queue.is_empty self.q then (
if self.closed then (
Mutex.unlock self.mutex;
raise Closed
);
Condition.wait self.cond self.mutex;
(loop [@tailcall]) ()
) else (
@ -77,9 +80,56 @@ let try_push (self : _ t) x : bool =
raise Closed
);
let was_empty = Queue.is_empty self.q in
Queue.push x self.q;
if was_empty then Condition.broadcast self.cond;
Mutex.unlock self.mutex;
true
) else
false
let size (self : _ t) : int =
Mutex.lock self.mutex;
let n = Queue.length self.q in
Mutex.unlock self.mutex;
n
let transfer (self : 'a t) q2 : unit =
Mutex.lock self.mutex;
let continue = ref true in
while !continue do
if Queue.is_empty self.q then (
if self.closed then (
Mutex.unlock self.mutex;
raise Closed
);
Condition.wait self.cond self.mutex
) else (
Queue.transfer self.q q2;
Mutex.unlock self.mutex;
continue := false
)
done
type 'a gen = unit -> 'a option
type 'a iter = ('a -> unit) -> unit
let to_iter self k =
try
while true do
let x = pop self in
k x
done
with Closed -> ()
let to_gen self : _ gen =
fun () ->
match pop self with
| exception Closed -> None
| x -> Some x
let rec to_seq self : _ Seq.t =
fun () ->
match pop self with
| exception Closed -> Seq.Nil
| x -> Seq.Cons (x, to_seq self)

79
src/core/bb_queue.mli Normal file

@ -0,0 +1,79 @@
(** Basic Blocking Queue *)
type 'a t
val create : unit -> _ t
exception Closed
val push : 'a t -> 'a -> unit
(** [push q x] pushes [x] into [q], and returns [()].
@raise Closed if [close q] was previously called.*)
val size : _ t -> int
(** Number of items currently in the queue.
@since 0.2 *)
val pop : 'a t -> 'a
(** [pop q] pops the next element in [q]. It might block until an element comes.
@raise Closed if the queue was closed before a new element was available. *)
val try_pop : force_lock:bool -> 'a t -> 'a option
(** [try_pop q] immediately pops the first element of [q], if any, or returns
[None] without blocking.
@param force_lock
if true, use {!Mutex.lock} (which can block under contention); if false,
use {!Mutex.try_lock}, which might return [None] even in presence of an
element if there's contention *)
val try_push : 'a t -> 'a -> bool
(** [try_push q x] tries to push into [q], in which case it returns [true]; or
it fails to push and returns [false] without blocking.
@raise Closed if the locking succeeded but the queue is closed. *)
val transfer : 'a t -> 'a Queue.t -> unit
(** [transfer bq q2] transfers all items presently in [bq] into [q2] in one
atomic section, and clears [bq]. It blocks if no element is in [bq].
This is useful to consume elements from the queue in batch. Create a
[Queue.t] locally:
{[
let dowork (work_queue : job Bb_queue.t) =
(* local queue, not thread safe *)
let local_q = Queue.create () in
try
while true do
(* work on local events, already on this thread *)
while not (Queue.is_empty local_q) do
let job = Queue.pop local_q in
process_job job
done;
(* get all the events in the incoming blocking queue, in
one single critical section. *)
Bb_queue.transfer work_queue local_q
done
with Bb_queue.Closed -> ()
]}
@since 0.4 *)
val close : _ t -> unit
(** Close the queue, meaning there won't be any more [push] allowed. *)
type 'a gen = unit -> 'a option
type 'a iter = ('a -> unit) -> unit
val to_iter : 'a t -> 'a iter
(** [to_iter q] returns an iterator over all items in the queue. This might not
terminate if [q] is never closed.
@since 0.4 *)
val to_gen : 'a t -> 'a gen
(** [to_gen q] returns a generator from the queue.
@since 0.4 *)
val to_seq : 'a t -> 'a Seq.t
(** [to_seq q] returns a (transient) sequence from the queue.
@since 0.4 *)
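A sketch of the consumer side (assuming `Bb_queue` is exposed as `Moonpool.Blocking_queue`, as the changelog entries above suggest):
```ocaml
(* a dedicated consumer thread drains the queue with [to_iter];
   the iteration ends once the queue is closed *)
module Q = Moonpool.Blocking_queue

let () =
  let q : int Q.t = Q.create () in
  let consumer =
    Thread.create
      (fun () -> Q.to_iter q (fun x -> Printf.printf "got %d\n%!" x))
      ()
  in
  List.iter (Q.push q) [ 1; 2; 3 ];
  Q.close q;
  Thread.join consumer
```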

123
src/core/chan.ml Normal file

@ -0,0 +1,123 @@
exception Closed
type 'a t = {
q: 'a Queue.t;
mutex: Mutex.t; (** protects critical section *)
mutable closed: bool;
max_size: int;
push_waiters: Trigger.t Queue.t;
pop_waiters: Trigger.t Queue.t;
}
let create ~max_size () : _ t =
if max_size < 0 then invalid_arg "Chan: max_size < 0";
{
max_size;
mutex = Mutex.create ();
closed = false;
q = Queue.create ();
push_waiters = Queue.create ();
pop_waiters = Queue.create ();
}
let try_push (self : _ t) x : bool =
if Mutex.try_lock self.mutex then (
if self.closed then (
Mutex.unlock self.mutex;
raise Closed
);
match Queue.length self.q with
| 0 ->
let to_awake = Queue.create () in
Queue.push x self.q;
Queue.transfer self.pop_waiters to_awake;
Mutex.unlock self.mutex;
(* wake up pop triggers if needed. Be careful to do that
outside the critical section*)
Queue.iter Trigger.signal to_awake;
true
| n when n < self.max_size ->
Queue.push x self.q;
Mutex.unlock self.mutex;
true
| _ ->
Mutex.unlock self.mutex;
false
) else
false
let try_pop (type elt) self : elt option =
if Mutex.try_lock self.mutex then (
match Queue.pop self.q with
| exception Queue.Empty ->
Mutex.unlock self.mutex;
if self.closed then
raise Closed
else
None
| x ->
Mutex.unlock self.mutex;
Some x
) else
None
let close (self : _ t) : unit =
let triggers_to_signal = Queue.create () in
Mutex.lock self.mutex;
if not self.closed then (
self.closed <- true;
Queue.transfer self.pop_waiters triggers_to_signal;
Queue.transfer self.push_waiters triggers_to_signal
);
Mutex.unlock self.mutex;
Queue.iter Trigger.signal triggers_to_signal
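(* blocking variants below: if the channel is full (for [push]) or empty
   (for [pop]), register a [Trigger.t] in the corresponding waiters queue,
   release the mutex, await the trigger, then retry from scratch *)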
let rec push (self : _ t) x : unit =
Mutex.lock self.mutex;
if self.closed then (
Mutex.unlock self.mutex;
raise Closed
);
match Queue.length self.q with
| 0 ->
Queue.push x self.q;
let to_wakeup = Queue.create () in
Queue.transfer self.pop_waiters to_wakeup;
Mutex.unlock self.mutex;
Queue.iter Trigger.signal to_wakeup
| n when n < self.max_size ->
Queue.push x self.q;
Mutex.unlock self.mutex
| _ ->
let tr = Trigger.create () in
Queue.push tr self.push_waiters;
Mutex.unlock self.mutex;
Trigger.await_exn tr;
push self x
let rec pop (self : 'a t) : 'a =
Mutex.lock self.mutex;
match Queue.pop self.q with
| x ->
if Queue.is_empty self.q then (
let to_wakeup = Queue.create () in
Queue.transfer self.push_waiters to_wakeup;
Mutex.unlock self.mutex;
Queue.iter Trigger.signal to_wakeup
) else
Mutex.unlock self.mutex;
x
| exception Queue.Empty ->
if self.closed then (
Mutex.unlock self.mutex;
raise Closed
);
let tr = Trigger.create () in
Queue.push tr self.pop_waiters;
Mutex.unlock self.mutex;
Trigger.await_exn tr;
pop self

49
src/core/chan.mli Normal file

@ -0,0 +1,49 @@
(** Channels.
The channels have bounded size. They use effects/await to provide
a direct style implementation. Pushing into a full channel,
or popping from an empty one, will suspend the current task.
Channels have been bounded since 0.7.
*)
type 'a t
(** Channel carrying values of type ['a]. *)
val create : max_size:int -> unit -> 'a t
(** Create a channel. *)
exception Closed
val try_push : 'a t -> 'a -> bool
(** [try_push chan x] pushes [x] into [chan]. This does not block. Returns
[true] if it succeeded in pushing.
@raise Closed if the channel is closed. *)
val try_pop : 'a t -> 'a option
(** [try_pop chan] pops and return an element if one is available immediately.
Otherwise it returns [None].
@raise Closed if the channel is closed and empty. *)
val close : _ t -> unit
(** Close the channel. Further push and pop calls will fail. This is idempotent.
*)
val push : 'a t -> 'a -> unit
(** Push the value into the channel, suspending the current task if the channel
is currently full.
@raise Closed if the channel is closed
@since 0.7 *)
val pop : 'a t -> 'a
(** Pop an element. This might suspend the current task if the channel is
currently empty.
@raise Closed if the channel is empty and closed.
@since 0.7 *)
(*
val pop_block_exn : 'a t -> 'a
(** Like [pop], but blocks if an element is not available immediately.
The precautions around blocking from inside a thread pool
are the same as explained in {!Fut.wait_block}. *)
*)

15
src/core/dune Normal file

@ -0,0 +1,15 @@
(library
(public_name moonpool)
(name moonpool)
(libraries
moonpool.private
(re_export thread-local-storage)
(select
hmap_ls_.ml
from
(hmap -> hmap_ls_.real.ml)
(-> hmap_ls_.dummy.ml))
moonpool.dpool
(re_export picos))
(flags :standard -open Moonpool_private)
(private_modules util_pool_))

30
src/core/exn_bt.ml Normal file

@ -0,0 +1,30 @@
type t = exn * Printexc.raw_backtrace
let[@inline] make exn bt : t = exn, bt
let[@inline] exn (e, _) = e
let[@inline] bt (_, bt) = bt
let show self =
let bt = Printexc.raw_backtrace_to_string (bt self) in
let exn = Printexc.to_string (exn self) in
if bt = "" then
exn
else
Printf.sprintf "%s\n%s" exn bt
let pp out self = Format.pp_print_string out (show self)
let[@inline] raise (e, bt) = Printexc.raise_with_backtrace e bt
let[@inline] get exn =
let bt = Printexc.get_raw_backtrace () in
make exn bt
let[@inline] get_callstack n exn =
let bt = Printexc.get_callstack n in
make exn bt
type nonrec 'a result = ('a, t) result
let[@inline] unwrap = function
| Ok x -> x
| Error ebt -> raise ebt

29
src/core/exn_bt.mli Normal file

@ -0,0 +1,29 @@
(** Exception with backtrace.
Type changed @since 0.7
@since 0.6 *)
(** An exception bundled with a backtrace *)
type t = exn * Printexc.raw_backtrace
val exn : t -> exn
val bt : t -> Printexc.raw_backtrace
val raise : t -> 'a
val get : exn -> t
val get_callstack : int -> exn -> t
val make : exn -> Printexc.raw_backtrace -> t
(** Trivial builder *)
val show : t -> string
(** Simple printing *)
val pp : Format.formatter -> t -> unit
type nonrec 'a result = ('a, t) result
val unwrap : 'a result -> 'a
(** [unwrap (Ok x)] is [x], [unwrap (Error ebt)] re-raises [ebt].
@since 0.7 *)

184
src/core/fifo_pool.ml Normal file

@ -0,0 +1,184 @@
open Types_
include Runner
module WL = Worker_loop_
type fiber = Picos.Fiber.t
type task_full = WL.task_full
let ( let@ ) = ( @@ )
type state = {
threads: Thread.t array;
q: task_full Bb_queue.t; (** Queue for tasks. *)
mutable as_runner: t;
(* init options *)
name: string option;
on_init_thread: dom_id:int -> t_id:int -> unit -> unit;
on_exit_thread: dom_id:int -> t_id:int -> unit -> unit;
on_exn: exn -> Printexc.raw_backtrace -> unit;
}
(** internal state *)
type worker_state = {
idx: int;
dom_idx: int;
st: state;
}
let[@inline] size_ (self : state) = Array.length self.threads
let[@inline] num_tasks_ (self : state) : int = Bb_queue.size self.q
(*
get_thread_state = TLS.get_opt k_worker_state
*)
let default_thread_init_exit_ ~dom_id:_ ~t_id:_ () = ()
let shutdown_ ~wait (self : state) : unit =
Bb_queue.close self.q;
if wait then Array.iter Thread.join self.threads
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?num_threads:int ->
?name:string ->
'a
(** Run [task] as is, on the pool. *)
let schedule_ (self : state) (task : task_full) : unit =
try Bb_queue.push self.q task with Bb_queue.Closed -> raise Shutdown
let runner_of_state (pool : state) : t =
let run_async ~fiber f = schedule_ pool @@ T_start { f; fiber } in
Runner.For_runner_implementors.create
~shutdown:(fun ~wait () -> shutdown_ pool ~wait)
~run_async
~size:(fun () -> size_ pool)
~num_tasks:(fun () -> num_tasks_ pool)
()
(** Run [task] as is, on the pool. *)
let schedule_w (self : worker_state) (task : task_full) : unit =
try Bb_queue.push self.st.q task with Bb_queue.Closed -> raise Shutdown
let get_next_task (self : worker_state) =
try Bb_queue.pop self.st.q with Bb_queue.Closed -> raise WL.No_more_tasks
let before_start (self : worker_state) =
let t_id = Thread.id @@ Thread.self () in
self.st.on_init_thread ~dom_id:self.dom_idx ~t_id ();
(* set thread name *)
Option.iter
(fun name ->
Tracing_.set_thread_name (Printf.sprintf "%s.worker.%d" name self.idx))
self.st.name
let cleanup (self : worker_state) : unit =
(* on termination, decrease refcount of underlying domain *)
Domain_pool_.decr_on self.dom_idx;
let t_id = Thread.id @@ Thread.self () in
self.st.on_exit_thread ~dom_id:self.dom_idx ~t_id ()
let worker_ops : worker_state WL.ops =
let runner (st : worker_state) = st.st.as_runner in
let on_exn (st : worker_state) (ebt : Exn_bt.t) =
st.st.on_exn (Exn_bt.exn ebt) (Exn_bt.bt ebt)
in
{
WL.schedule = schedule_w;
runner;
get_next_task;
on_exn;
before_start;
cleanup;
}
let create_ ?(on_init_thread = default_thread_init_exit_)
?(on_exit_thread = default_thread_init_exit_) ?(on_exn = fun _ _ -> ())
~threads ?name () : state =
let self =
{
threads;
q = Bb_queue.create ();
as_runner = Runner.dummy;
name;
on_init_thread;
on_exit_thread;
on_exn;
}
in
self.as_runner <- runner_of_state self;
self
let create ?on_init_thread ?on_exit_thread ?on_exn ?num_threads ?name () : t =
let num_domains = Domain_pool_.max_number_of_domains () in
(* number of threads to run *)
let num_threads = Util_pool_.num_threads ?num_threads () in
(* make sure we don't bias towards the first domain(s) in {!D_pool_} *)
let offset = Random.int num_domains in
let pool =
let dummy_thread = Thread.self () in
let threads = Array.make num_threads dummy_thread in
create_ ?on_init_thread ?on_exit_thread ?on_exn ~threads ?name ()
in
let runner = runner_of_state pool in
(* temporary queue used to obtain thread handles from domains
on which the thread are started. *)
let receive_threads = Bb_queue.create () in
(* start the thread with index [i] *)
let start_thread_with_idx i =
let dom_idx = (offset + i) mod num_domains in
(* function called in domain with index [i], to
create the thread and push it into [receive_threads] *)
let create_thread_in_domain () =
let st = { idx = i; dom_idx; st = pool } in
let thread =
Thread.create (WL.worker_loop ~block_signals:true ~ops:worker_ops) st
in
(* send the thread from the domain back to us *)
Bb_queue.push receive_threads (i, thread)
in
Domain_pool_.run_on dom_idx create_thread_in_domain
in
(* start all threads, placing them on the domains
according to their index and [offset] in a round-robin fashion. *)
for i = 0 to num_threads - 1 do
start_thread_with_idx i
done;
(* receive the newly created threads back from domains *)
for _j = 1 to num_threads do
let i, th = Bb_queue.pop receive_threads in
pool.threads.(i) <- th
done;
runner
let with_ ?on_init_thread ?on_exit_thread ?on_exn ?num_threads ?name () f =
let pool =
create ?on_init_thread ?on_exit_thread ?on_exn ?num_threads ?name ()
in
let@ () = Fun.protect ~finally:(fun () -> shutdown pool) in
f pool
module Private_ = struct
type nonrec worker_state = worker_state
let worker_ops = worker_ops
let runner_of_state (self : worker_state) = worker_ops.runner self
let create_single_threaded_state ~thread ?on_exn () : worker_state =
let st : state = create_ ?on_exn ~threads:[| thread |] () in
{ idx = 0; dom_idx = 0; st }
end

58
src/core/fifo_pool.mli Normal file

@ -0,0 +1,58 @@
(** A simple thread pool in FIFO order.
FIFO: first-in, first-out. Basically tasks are put into a queue, and worker
threads pull them out of the queue at the other end.
Since this uses a single blocking queue to manage tasks, it's very simple
and reliable. The number of worker threads is fixed, but they are spread
over several domains to enable parallelism.
This can be useful for latency-sensitive applications (e.g. as a pool of
workers for network servers). Work-stealing pools might have higher
throughput but they're very unfair to some tasks; by contrast, here, older
tasks have priority over younger tasks.
@since 0.5 *)
include module type of Runner
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?num_threads:int ->
?name:string ->
'a
(** Arguments used in {!create}. See {!create} for explanations. *)
val create : (unit -> t, _) create_args
(** [create ()] makes a new thread pool.
@param on_init_thread
called at the beginning of each new thread in the pool.
@param num_threads
number of worker threads. See {!Ws_pool.create} for more details.
@param on_exit_thread called at the end of each worker thread in the pool.
@param name name for the pool, used in tracing (since 0.6) *)
val with_ : (unit -> (t -> 'a) -> 'a, _) create_args
(** [with_ () f] calls [f pool], where [pool] is obtained via {!create}. When
[f pool] returns or fails, [pool] is shutdown and its resources are
released. Most parameters are the same as in {!create}. *)
(**/**)
module Private_ : sig
type worker_state
val worker_ops : worker_state Worker_loop_.ops
val create_single_threaded_state :
thread:Thread.t ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
unit ->
worker_state
val runner_of_state : worker_state -> Runner.t
end
(**/**)

456
src/core/fut.ml Normal file

@ -0,0 +1,456 @@
module A = Atomic
module C = Picos.Computation
type 'a or_error = ('a, Exn_bt.t) result
type 'a waiter = 'a or_error -> unit
type 'a t = 'a C.t
type 'a promise = 'a t
let[@inline] make_promise () : _ t =
let fut = C.create ~mode:`LIFO () in
fut
let make () =
let fut = make_promise () in
fut, fut
let[@inline] return x : _ t = C.returned x
let[@inline] cancel x ebt = C.cancel x (fst ebt) (snd ebt)
let[@inline] try_cancel x ebt = C.try_cancel x (fst ebt) (snd ebt)
let[@inline] fail exn bt : _ t =
let fut = C.create () in
C.cancel fut exn bt;
fut
let[@inline] fail_exn_bt ebt = fail (Exn_bt.exn ebt) (Exn_bt.bt ebt)
let[@inline] of_result = function
| Ok x -> return x
| Error ebt -> fail_exn_bt ebt
let[@inline] is_resolved self : bool = not (C.is_running self)
let is_done = is_resolved
let peek : 'a t -> _ option = C.peek
let raise_if_failed : _ t -> unit = C.check
let[@inline] is_success self =
match C.peek_exn self with
| _ -> true
| exception _ -> false
let is_failed : _ t -> bool = C.is_canceled
exception Not_ready
let[@inline] get_or_fail self =
match C.peek self with
| Some x -> x
| None -> raise Not_ready
let[@inline] get_or_fail_exn self =
match C.peek_exn self with
| x -> x
| exception C.Running -> raise Not_ready
let[@inline] peek_or_assert_ (self : 'a t) : 'a =
match C.peek_exn self with
| x -> x
| exception C.Running -> assert false
let on_result_cb_ _tr f self : unit =
match peek_or_assert_ self with
| x -> f (Ok x)
| exception exn ->
let ebt = Exn_bt.get exn in
f (Error ebt)
let on_result (self : _ t) (f : _ waiter) : unit =
let trigger =
(Trigger.from_action f self on_result_cb_ [@alert "-handler"])
in
if not (C.try_attach self trigger) then on_result_cb_ () f self
let on_result_ignore_cb_ _tr f (self : _ t) =
f (Picos.Computation.canceled self)
let on_result_ignore (self : _ t) f : unit =
if Picos.Computation.is_running self then (
let trigger =
(Trigger.from_action f self on_result_ignore_cb_ [@alert "-handler"])
in
if not (C.try_attach self trigger) then on_result_ignore_cb_ () f self
) else
on_result_ignore_cb_ () f self
let[@inline] fulfill_idempotent self r =
match r with
| Ok x -> C.return self x
| Error ebt -> C.cancel self (Exn_bt.exn ebt) (Exn_bt.bt ebt)
exception Already_fulfilled
let fulfill (self : _ t) (r : _ result) : unit =
let ok =
match r with
| Ok x -> C.try_return self x
| Error ebt -> C.try_cancel self (Exn_bt.exn ebt) (Exn_bt.bt ebt)
in
if not ok then raise Already_fulfilled
(* ### combinators ### *)
let spawn ~on f : _ t =
let fut = make_promise () in
let task () =
try
let res = f () in
C.return fut res
with exn ->
let bt = Printexc.get_raw_backtrace () in
C.cancel fut exn bt
in
Runner.run_async on task;
fut
let spawn_on_current_runner f : _ t =
match Runner.get_current_runner () with
| None -> failwith "Fut.spawn_on_current_runner: not running on a runner"
| Some on -> spawn ~on f
let reify_error (f : 'a t) : 'a or_error t =
match peek f with
| Some res -> return res
| None ->
let fut = make_promise () in
on_result f (fun r -> fulfill fut (Ok r));
fut
let[@inline] get_runner_ ?on () : Runner.t option =
match on with
| Some _ as r -> r
| None -> Runner.get_current_runner ()
let map ?on ~f fut : _ t =
let map_immediate_ r : _ result =
match r with
| Ok x ->
(try Ok (f x)
with exn ->
let bt = Printexc.get_raw_backtrace () in
Error (Exn_bt.make exn bt))
| Error e_bt -> Error e_bt
in
match peek fut, get_runner_ ?on () with
| Some res, None -> of_result @@ map_immediate_ res
| Some res, Some runner ->
let fut2, promise = make () in
Runner.run_async runner (fun () -> fulfill promise @@ map_immediate_ res);
fut2
| None, None ->
let fut2, promise = make () in
on_result fut (fun res -> fulfill promise @@ map_immediate_ res);
fut2
| None, Some runner ->
let fut2, promise = make () in
on_result fut (fun res ->
Runner.run_async runner (fun () ->
fulfill promise @@ map_immediate_ res));
fut2
let join (fut : 'a t t) : 'a t =
match peek fut with
| Some (Ok f) -> f
| Some (Error ebt) -> fail_exn_bt ebt
| None ->
let fut2, promise = make () in
on_result fut (function
| Ok sub_fut -> on_result sub_fut (fulfill promise)
| Error _ as e -> fulfill promise e);
fut2
let bind ?on ~f fut : _ t =
let apply_f_to_res r : _ t =
match r with
| Ok x ->
(try f x
with e ->
let bt = Printexc.get_raw_backtrace () in
fail e bt)
| Error ebt -> fail_exn_bt ebt
in
let bind_and_fulfill (r : _ result) promise () : unit =
let f_res_fut = apply_f_to_res r in
(* forward result *)
on_result f_res_fut (fun r -> fulfill promise r)
in
match peek fut, get_runner_ ?on () with
| Some res, Some runner ->
let fut2, promise = make () in
Runner.run_async runner (bind_and_fulfill res promise);
fut2
| Some res, None -> apply_f_to_res res
| None, Some runner ->
let fut2, promise = make () in
on_result fut (fun r ->
Runner.run_async runner (bind_and_fulfill r promise));
fut2
| None, None ->
let fut2, promise = make () in
on_result fut (fun res -> bind_and_fulfill res promise ());
fut2
let[@inline] bind_reify_error ?on ~f fut : _ t = bind ?on ~f (reify_error fut)
let update_atomic_ (st : 'a A.t) f : 'a =
let rec loop () =
let x = A.get st in
let y = f x in
if A.compare_and_set st x y then
y
else (
Domain_.relax ();
loop ()
)
in
loop ()
let both a b : _ t =
match peek a, peek b with
| Some (Ok x), Some (Ok y) -> return (x, y)
| Some (Error ebt), _ | _, Some (Error ebt) -> fail_exn_bt ebt
| _ ->
let fut, promise = make () in
let st = A.make `Neither in
on_result a (function
| Error err -> fulfill_idempotent promise (Error err)
| Ok x ->
(match
update_atomic_ st (function
| `Neither -> `Left x
| `Right y -> `Both (x, y)
| _ -> assert false)
with
| `Both (x, y) -> fulfill promise (Ok (x, y))
| _ -> ()));
on_result b (function
| Error err -> fulfill_idempotent promise (Error err)
| Ok y ->
(match
update_atomic_ st (function
| `Left x -> `Both (x, y)
| `Neither -> `Right y
| _ -> assert false)
with
| `Both (x, y) -> fulfill promise (Ok (x, y))
| _ -> ()));
fut
let choose a b : _ t =
match peek a, peek b with
| Some (Ok x), _ -> return (Either.Left x)
| _, Some (Ok y) -> return (Either.Right y)
| Some (Error ebt), Some (Error _) -> fail_exn_bt ebt
| _ ->
let fut, promise = make () in
let one_failure = A.make false in
on_result a (function
| Error err ->
if A.exchange one_failure true then
(* the other one failed already *)
fulfill_idempotent promise (Error err)
| Ok x -> fulfill_idempotent promise (Ok (Either.Left x)));
on_result b (function
| Error err ->
if A.exchange one_failure true then
(* the other one failed already *)
fulfill_idempotent promise (Error err)
| Ok y -> fulfill_idempotent promise (Ok (Either.Right y)));
fut
let choose_same a b : _ t =
match peek a, peek b with
| Some (Ok x), _ -> return x
| _, Some (Ok y) -> return y
| Some (Error ebt), Some (Error _) -> fail_exn_bt ebt
| _ ->
let fut, promise = make () in
let one_failure = A.make false in
on_result a (function
| Error err ->
if A.exchange one_failure true then
fulfill_idempotent promise (Error err)
| Ok x -> fulfill_idempotent promise (Ok x));
on_result b (function
| Error err ->
if A.exchange one_failure true then
fulfill_idempotent promise (Error err)
| Ok y -> fulfill_idempotent promise (Ok y));
fut
let barrier_on_abstract_container_of_futures ~iter ~len ~aggregate_results cont
: _ t =
let n_items = len cont in
if n_items = 0 then (
(* no items, return now. *)
let cont_empty = aggregate_results (fun _ -> assert false) cont in
return cont_empty
) else (
let fut, promise = make () in
let missing = A.make n_items in
(* callback called when a future in [a] is resolved *)
let on_res = function
| None ->
let n = A.fetch_and_add missing (-1) in
if n = 1 then (
(* last future, we know they all succeeded, so resolve [fut] *)
let res = aggregate_results peek_or_assert_ cont in
fulfill promise (Ok res)
)
| Some e_bt ->
(* immediately cancel all other [on_res] *)
let n = A.exchange missing 0 in
if n > 0 then
(* we're the only one to set to 0, so we can fulfill [fut]
with an error. *)
fulfill promise (Error e_bt)
in
iter (fun fut -> on_result_ignore fut on_res) cont;
fut
)
module Advanced = struct
let barrier_on_abstract_container_of_futures =
barrier_on_abstract_container_of_futures
end
let join_array (a : _ t array) : _ array t =
match Array.length a with
| 0 -> return [||]
| 1 -> map ?on:None a.(0) ~f:(fun x -> [| x |])
| _ ->
barrier_on_abstract_container_of_futures ~len:Array.length
~aggregate_results:Array.map ~iter:Array.iter a
let join_list (l : _ t list) : _ list t =
match l with
| [] -> return []
| [ x ] -> map ?on:None x ~f:(fun x -> [ x ])
| _ ->
barrier_on_abstract_container_of_futures ~len:List.length
~aggregate_results:List.map ~iter:List.iter l
let[@inline] map_list ~f l : _ list t = List.map f l |> join_list
let wait_array (a : _ t array) : unit t =
barrier_on_abstract_container_of_futures a ~iter:Array.iter ~len:Array.length
~aggregate_results:(fun _f _ -> ())
let wait_list (a : _ t list) : unit t =
barrier_on_abstract_container_of_futures a ~iter:List.iter ~len:List.length
~aggregate_results:(fun _f _ -> ())
let for_ ~on n f : unit t =
barrier_on_abstract_container_of_futures
~len:(fun () -> n)
~iter:(fun yield () ->
for i = 0 to n - 1 do
yield (spawn ~on (fun () -> f i))
done)
~aggregate_results:(fun _f () -> ())
()
let for_array ~on arr f : unit t =
for_ ~on (Array.length arr) (fun i -> f i arr.(i))
let for_list ~on l f : unit t =
let futs = List.rev_map (fun x -> spawn ~on (fun () -> f x)) l in
wait_list futs
(* ### blocking ### *)
let push_queue_ _tr q () = Bb_queue.push q ()
let wait_block_exn (self : 'a t) : 'a =
match C.peek_exn self with
| x -> x (* fast path *)
| exception C.Running ->
let real_block () =
(* use queue only once *)
let q = Bb_queue.create () in
let trigger = Trigger.create () in
let attached =
(Trigger.on_signal trigger q () push_queue_ [@alert "-handler"])
in
assert attached;
(* blockingly wait for the trigger if the computation didn't complete in the meantime *)
if C.try_attach self trigger then Bb_queue.pop q;
(* trigger was signaled! the computation must be done *)
peek_or_assert_ self
in
(* TODO: use backoff? *)
(* a bit of spinning before we block *)
let rec loop i =
if i = 0 then
real_block ()
else (
match C.peek_exn self with
| x -> x
| exception C.Running ->
Domain_.relax ();
(loop [@tailcall]) (i - 1)
)
in
loop 50
let wait_block self =
match wait_block_exn self with
| x -> Ok x
| exception exn ->
let bt = Printexc.get_raw_backtrace () in
Error (Exn_bt.make exn bt)
let await (self : 'a t) : 'a =
(* fast path: peek *)
match C.peek_exn self with
| res -> res
| exception C.Running ->
let trigger = Trigger.create () in
(* suspend until the future is resolved *)
if C.try_attach self trigger then Trigger.await_exn trigger;
(* un-suspended: we should have a result! *)
get_or_fail_exn self
let yield = Picos.Fiber.yield
module Infix = struct
let[@inline] ( >|= ) x f = map ~f x
let[@inline] ( >>= ) x f = bind ~f x
let ( let+ ) = ( >|= )
let ( let* ) = ( >>= )
let ( and+ ) = both
let ( and* ) = both
end
include Infix
module Infix_local = Infix [@@deprecated "use Infix"]
module Private_ = struct
let[@inline] unsafe_promise_of_fut x = x
let[@inline] as_computation self = self
end

332
src/core/fut.mli Normal file

@ -0,0 +1,332 @@
(** Futures.
A future of type ['a t] represents the result of a computation that will
yield a value of type ['a].
Typically, the computation is running on a thread pool {!Runner.t} and will
proceed on some worker. Once set, a future cannot change. It either succeeds
(storing [Ok x] with [x : 'a]) or fails (storing [Error (exn, bt)] with
an exception and the corresponding backtrace).
Using {!spawn}, it's possible to start a bunch of tasks, obtaining futures,
and then use {!await} to get their result in the desired order.
Combinators such as {!map} and {!join_array} can be used to produce futures
from other futures (in a monadic way). Some combinators take a [on] argument
to specify a runner on which the intermediate computation takes place; for
example [map ~on:pool ~f fut] maps the value in [fut] using function [f],
applicatively; the call to [f] happens on the runner [pool] (once [fut]
resolves successfully with a value). Be aware that these combinators do not
preserve local storage. *)
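(* A small sketch of the workflow described above, assuming [pool] is some
   [Runner.t]:
   {[
     let f1 = Fut.spawn ~on:pool (fun () -> 21)
     let f2 = Fut.map ~f:(fun x -> x * 2) f1

     (* from a thread that is not a worker of [pool]: *)
     let () = assert (Fut.wait_block_exn f2 = 42)
   ]} *)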
type 'a or_error = ('a, Exn_bt.t) result
type 'a t = 'a Picos.Computation.t
(** A future with a result of type ['a]. *)
type 'a promise = private 'a t
(** A promise, which can be fulfilled exactly once to set the corresponding
future. This is a private alias of ['a t] since 0.7; previously it was
opaque. *)
val make : unit -> 'a t * 'a promise
(** Make a new future with the associated promise. *)
val make_promise : unit -> 'a promise
(** Same as {!make} but returns a single promise (which can be upcast to a
future). This is mostly useful to save memory; you probably don't need it.
How to upcast to a future in the worst case:
{[
let prom = Fut.make_promise ()
let fut = (prom : _ Fut.promise :> _ Fut.t)
]}
@since 0.7 *)
val on_result : 'a t -> ('a or_error -> unit) -> unit
(** [on_result fut f] registers [f] to be called in the future when [fut] is
set; or calls [f] immediately if [fut] is already set.
{b NOTE:} it is ill-advised to do meaningful work inside the callback [f].
Instead, try to spawn another task on the runner, or use {!await}. *)
val on_result_ignore : _ t -> (Exn_bt.t option -> unit) -> unit
(** [on_result_ignore fut f] registers [f] to be called in the future when [fut]
is set; or calls [f] immediately if [fut] is already set. It does not pass
the result, only a success/error signal.
@since 0.7 *)
exception Already_fulfilled
val try_cancel : _ promise -> Exn_bt.t -> bool
(** [try_cancel promise ebt] tries to cancel the promise with the given
exception, returning [true] if it succeeded and [false] if the promise was
already resolved.
@since 0.9 *)
val cancel : _ promise -> Exn_bt.t -> unit
(** Silent version of {!try_cancel}, ignoring the result.
@since 0.9 *)
val fulfill : 'a promise -> 'a or_error -> unit
(** Fulfill the promise, setting the future at the same time.
@raise Already_fulfilled if the promise is already fulfilled. *)
val fulfill_idempotent : 'a promise -> 'a or_error -> unit
(** Fulfill the promise, setting the future at the same time. Does nothing if
the promise is already fulfilled. *)
val return : 'a -> 'a t
(** Already settled future, with a result *)
val fail : exn -> Printexc.raw_backtrace -> _ t
(** Already settled future, with a failure *)
val fail_exn_bt : Exn_bt.t -> _ t
(** Fail from a bundle of exception and backtrace
@since 0.6 *)
val of_result : 'a or_error -> 'a t
(** Already resolved future from a result. *)
val is_resolved : _ t -> bool
(** [is_resolved fut] is [true] iff [fut] is resolved. *)
val peek : 'a t -> 'a or_error option
(** [peek fut] returns [Some r] if [fut] is currently resolved with [r], and
[None] if [fut] is not resolved yet. *)
exception Not_ready
(** @since 0.2 *)
val get_or_fail : 'a t -> 'a or_error
(** [get_or_fail fut] obtains the result from [fut] if it's fulfilled (i.e. if
[peek fut] returns [Some res], [get_or_fail fut] returns [res]).
@raise Not_ready if the future is not ready.
@since 0.2 *)
val get_or_fail_exn : 'a t -> 'a
(** [get_or_fail_exn fut] obtains the result from [fut] if it's fulfilled, like
{!get_or_fail}. If the result is an [Error _], the exception inside is
re-raised.
@raise Not_ready if the future is not ready.
@since 0.2 *)
val is_done : _ t -> bool
(** Is the future resolved? This is the same as [peek fut |> Option.is_some].
@since 0.2 *)
val is_success : _ t -> bool
(** Checks if the future is resolved with [Ok _] as a result.
@since 0.6 *)
val is_failed : _ t -> bool
(** Checks if the future is resolved with [Error _] as a result.
@since 0.6 *)
val raise_if_failed : _ t -> unit
(** [raise_if_failed fut] raises [e] if [fut] failed with [e].
@since 0.6 *)
(** {2 Combinators} *)
val spawn : on:Runner.t -> (unit -> 'a) -> 'a t
(** [spawn ~on f] runs [f ()] on the given runner [on], and returns a future
that will hold its result. *)
val spawn_on_current_runner : (unit -> 'a) -> 'a t
(** This must be run from inside a runner, and schedules the new task on it as
well.
See {!Runner.get_current_runner} to see how the runner is found.
@since 0.5
@raise Failure if run from outside a runner. *)
val reify_error : 'a t -> 'a or_error t
(** [reify_error fut] turns a failing future into a non-failing one that
contains [Error (exn, bt)]. A non-failing future returning [x] is turned
into one containing [Ok x].
@since 0.4 *)
val map : ?on:Runner.t -> f:('a -> 'b) -> 'a t -> 'b t
(** [map ?on ~f fut] returns a new future [fut2] that resolves with [f x] if
[fut] resolved with [x]; and fails with [e] if [fut] fails with [e] or [f x]
raises [e].
@param on if provided, [f] runs on the given runner *)
val bind : ?on:Runner.t -> f:('a -> 'b t) -> 'a t -> 'b t
(** [bind ?on ~f fut] returns a new future [fut2] that resolves like the future
[f x] if [fut] resolved with [x]; and fails with [e] if [fut] fails with [e]
or [f x] raises [e].
This does not preserve local storage of [fut] inside [f].
@param on if provided, [f] runs on the given runner *)
val bind_reify_error : ?on:Runner.t -> f:('a or_error -> 'b t) -> 'a t -> 'b t
(** [bind_reify_error ?on ~f fut] returns a new future [fut2] that resolves like
the future [f (Ok x)] if [fut] resolved with [x]; and resolves like the
future [f (Error (exn, bt))] if [fut] fails with [exn] and backtrace [bt].
This does not preserve local storage of [fut] inside [f].
@param on if provided, [f] runs on the given runner
@since 0.4 *)
val join : 'a t t -> 'a t
(** [join fut] is [fut >>= Fun.id]. It joins the inner layer of the future.
@since 0.2 *)
val both : 'a t -> 'b t -> ('a * 'b) t
(** [both a b] succeeds with [x, y] if [a] succeeds with [x] and [b] succeeds
with [y], or fails if any of them fails. *)
val choose : 'a t -> 'b t -> ('a, 'b) Either.t t
(** [choose a b] succeeds with [Left x] or [Right y] if [a] succeeds with [x]
or [b] succeeds with [y], and fails if both of them fail. If they both
succeed, it is not specified which result is used. *)
val choose_same : 'a t -> 'a t -> 'a t
(** [choose_same a b] succeeds with the value of one of [a] or [b] if they
succeed, or fails if both fail. If they both succeed, it is not specified
which result is used. *)
val join_array : 'a t array -> 'a array t
(** Wait for all the futures in the array. Fails if any future fails. *)
val join_list : 'a t list -> 'a list t
(** Wait for all the futures in the list. Fails if any future fails. *)
(** Advanced primitives for synchronization *)
module Advanced : sig
val barrier_on_abstract_container_of_futures :
iter:(('a t -> unit) -> 'cont -> unit) ->
len:('cont -> int) ->
aggregate_results:(('a t -> 'a) -> 'cont -> 'res) ->
'cont ->
'res t
(** [barrier_on_abstract_container_of_futures ~iter ~aggregate_results ~len
cont] takes a container of futures ([cont]) with [len cont] elements, and
returns a future result of type ['res] (possibly another type of
container).
This waits for all futures in [cont: 'cont] to be done (futures obtained
via [iter <some function> cont]). If they all succeed, their results are
aggregated into a new result of type ['res] via
[aggregate_results <some function> cont].
{b NOTE}: the behavior is not specified if [iter f cont] (for a function
f) doesn't call [f] on exactly [len cont] elements.
@since 0.5.1 *)
end
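(* For instance, [join_array] above can be derived from this primitive; a
   sketch mirroring the actual implementation:
   {[
     let join_array (a : 'a Fut.t array) : 'a array Fut.t =
       Fut.Advanced.barrier_on_abstract_container_of_futures
         ~iter:Array.iter ~len:Array.length ~aggregate_results:Array.map a
   ]} *)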
val map_list : f:('a -> 'b t) -> 'a list -> 'b list t
(** [map_list ~f l] is like [join_list @@ List.map f l].
@since 0.5.1 *)
val wait_array : _ t array -> unit t
(** [wait_array arr] waits for all futures in [arr] to resolve. It discards the
individual results of futures in [arr]. It fails if any future fails. *)
val wait_list : _ t list -> unit t
(** [wait_list l] waits for all futures in [l] to resolve. It discards the
individual results of futures in [l]. It fails if any future fails. *)
val for_ : on:Runner.t -> int -> (int -> unit) -> unit t
(** [for_ ~on n f] runs [f 0], [f 1], …, [f (n-1)] on the runner, and returns a
future that resolves when all the tasks have resolved, or fails as soon as
one task has failed. *)
val for_array : on:Runner.t -> 'a array -> (int -> 'a -> unit) -> unit t
(** [for_array ~on arr f] runs [f 0 arr.(0)], …, [f (n-1) arr.(n-1)] in the
runner (where [n = Array.length arr]), and returns a future that resolves
when all the tasks are done, or fails if any of them fails.
@since 0.2 *)
val for_list : on:Runner.t -> 'a list -> ('a -> unit) -> unit t
(** [for_list ~on l f] is like [for_array ~on (Array.of_list l) f].
@since 0.2 *)
(** {2 Await}
This suspends the current task using an OCaml 5 algebraic effect, and makes
preparations for the task to be resumed once the future has been resolved.
*)
val await : 'a t -> 'a
(** [await fut] suspends the current task until [fut] is fulfilled, then
resumes the task on this same runner (but possibly on a different
thread/domain).
@since 0.3
This must only be run from inside the runner itself. The runner must support
{!Suspend_}. *)
val yield : unit -> unit
(** Like {!Moonpool.yield}.
@since 0.10 *)
(** {2 Blocking} *)
val wait_block : 'a t -> 'a or_error
(** [wait_block fut] blocks the current thread until [fut] is resolved, and
returns its value.
{b NOTE:} A word of warning: this will monopolize the calling thread until
the future resolves. This can also easily cause deadlocks, if enough threads
in a pool call [wait_block] on futures running on the same pool or a pool
depending on it.
A good rule to avoid deadlocks is to run this from outside of any pool, or
to have an acyclic order between pools where [wait_block] is only called
from a pool on futures evaluated in a pool that comes lower in the
hierarchy. If this rule is broken, it is possible for all threads in a pool
to wait for futures that can only make progress on these same threads, hence
the deadlock. *)
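(* A sketch of the safe pattern: block only from a thread that is not a worker
   of the pool, e.g. the thread that created it ([compute] and [use] are
   placeholders):
   {[
     let () =
       Ws_pool.with_ () @@ fun pool ->
       let f = Fut.spawn ~on:pool compute in
       match Fut.wait_block f with
       | Ok v -> use v
       | Error ebt -> Exn_bt.raise ebt
   ]} *)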
val wait_block_exn : 'a t -> 'a
(** Same as {!wait_block} but re-raises the exception if the future failed.
{b NOTE:} do check the cautionary note in {!wait_block} concerning
deadlocks. *)
(** {2 Infix operators}
These combinators run either on the current pool (if present), or otherwise
on the same thread that just fulfilled the previous future.
They were previously present as [module Infix_local] and [val infix], but
are now simplified.
@since 0.5 *)
(** @since 0.5 *)
module Infix : sig
val ( >|= ) : 'a t -> ('a -> 'b) -> 'b t
val ( >>= ) : 'a t -> ('a -> 'b t) -> 'b t
val ( let+ ) : 'a t -> ('a -> 'b) -> 'b t
val ( and+ ) : 'a t -> 'b t -> ('a * 'b) t
val ( let* ) : 'a t -> ('a -> 'b t) -> 'b t
val ( and* ) : 'a t -> 'b t -> ('a * 'b) t
end
include module type of Infix
module Infix_local = Infix
[@@deprecated "Use Infix"]
(** @deprecated use Infix instead *)
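(* A sketch of the applicative style these operators enable:
   {[
     let sum2 (a : int Fut.t) (b : int Fut.t) : int Fut.t =
       let open Fut.Infix in
       let+ x = a and+ y = b in
       x + y
   ]} *)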
(**/**)
module Private_ : sig
val unsafe_promise_of_fut : 'a t -> 'a promise
(** Do not use unless you know exactly what you are doing. *)
val as_computation : 'a t -> 'a Picos.Computation.t
(** Picos compat *)
end
(**/**)


@ -0,0 +1,7 @@
(**/**)
module Private_hmap_ls_ = struct
let copy_fls _ _ = ()
end
(**/**)

68
src/core/hmap_ls_.real.ml Normal file

@ -0,0 +1,68 @@
open Types_
open struct
module FLS = Picos.Fiber.FLS
end
(** A local hmap, inherited in children fibers *)
let k_local_hmap : Hmap.t FLS.t = FLS.create ()
(** Access the local [hmap], or an empty one if not set *)
let[@inline] get_local_hmap () : Hmap.t =
match TLS.get_exn k_cur_fiber with
| exception TLS.Not_set -> Hmap.empty
| fiber -> FLS.get fiber ~default:Hmap.empty k_local_hmap
let[@inline] set_local_hmap (h : Hmap.t) : unit =
match TLS.get_exn k_cur_fiber with
| exception TLS.Not_set -> ()
| fiber -> FLS.set fiber k_local_hmap h
let[@inline] update_local_hmap (f : Hmap.t -> Hmap.t) : unit =
match TLS.get_exn k_cur_fiber with
| exception TLS.Not_set -> ()
| fiber ->
let h = FLS.get fiber ~default:Hmap.empty k_local_hmap in
let h = f h in
FLS.set fiber k_local_hmap h
(** @raise Invalid_argument if not present *)
let get_in_local_hmap_exn (k : 'a Hmap.key) : 'a =
let h = get_local_hmap () in
Hmap.get k h
let get_in_local_hmap_opt (k : 'a Hmap.key) : 'a option =
let h = get_local_hmap () in
Hmap.find k h
(** Remove given key from the local hmap *)
let[@inline] remove_in_local_hmap (k : _ Hmap.key) : unit =
update_local_hmap (Hmap.rem k)
let[@inline] set_in_local_hmap (k : 'a Hmap.key) (v : 'a) : unit =
update_local_hmap (Hmap.add k v)
(** [with_in_local_hmap k v f] calls [f()] in a context where [k] is bound to
[v] in the local hmap. Then it restores the previous binding for [k]. *)
let with_in_local_hmap (k : 'a Hmap.key) (v : 'a) f =
let h = get_local_hmap () in
match Hmap.find k h with
| None ->
set_in_local_hmap k v;
Fun.protect ~finally:(fun () -> remove_in_local_hmap k) f
| Some old_v ->
set_in_local_hmap k v;
Fun.protect ~finally:(fun () -> set_in_local_hmap k old_v) f
(**/**)
(* private functions, to be used by the rest of moonpool *)
module Private_hmap_ls_ = struct
(** Copy the hmap from f1.fls to f2.fls *)
let copy_fls (f1 : Picos.Fiber.t) (f2 : Picos.Fiber.t) : unit =
match FLS.get_exn f1 k_local_hmap with
| exception FLS.Not_set -> ()
| hmap -> FLS.set f2 k_local_hmap hmap
end
(**/**)

36
src/core/lock.ml Normal file

@ -0,0 +1,36 @@
type 'a t = {
mutex: Mutex.t;
mutable content: 'a;
}
let create content : _ t = { mutex = Mutex.create (); content }
let[@inline never] with_ (self : _ t) f =
Mutex.lock self.mutex;
match f self.content with
| x ->
Mutex.unlock self.mutex;
x
| exception e ->
Mutex.unlock self.mutex;
raise e
let[@inline] mutex self = self.mutex
let[@inline] update self f = with_ self (fun x -> self.content <- f x)
let[@inline] update_map l f =
with_ l (fun x ->
let x', y = f x in
l.content <- x';
y)
let[@inline never] get l =
Mutex.lock l.mutex;
let x = l.content in
Mutex.unlock l.mutex;
x
let[@inline never] set l x =
Mutex.lock l.mutex;
l.content <- x;
Mutex.unlock l.mutex

58
src/core/lock.mli Normal file

@ -0,0 +1,58 @@
(** Mutex-protected resource.
This lock is a synchronous concurrency primitive, as a thin wrapper around
{!Mutex} that encourages proper management of the critical section in RAII
style:
{[
let (let@) = (@@)
let compute_foo () =
  (* enter critical section *)
  let@ x = Lock.with_ protected_resource in
  use x;
  return_foo ()
(* critical section has been exited *)
]}
This lock does not work well with {!Fut.await}. A critical section that
contains a call to [await] might cause deadlocks, or lock starvation,
because it will hold onto the lock while it goes to sleep.
@since 0.3 *)
type 'a t
(** A value protected by a mutex *)
val create : 'a -> 'a t
(** Create a new protected value. *)
val with_ : 'a t -> ('a -> 'b) -> 'b
(** [with_ l f] runs [f x] where [x] is the value protected by the lock [l],
in a critical section. If [f x] fails, [with_ l f] fails too, but the
lock is still released. *)
val update : 'a t -> ('a -> 'a) -> unit
(** [update l f] replaces the content [x] of [l] with [f x], while protected by
the mutex. *)
val update_map : 'a t -> ('a -> 'a * 'b) -> 'b
(** [update_map l f] computes [x', y = f (get l)], then puts [x'] in [l] and
returns [y], while protected by the mutex. *)
val mutex : _ t -> Mutex.t
(** Underlying mutex. *)
val get : 'a t -> 'a
(** Atomically get the value in the lock. The value that is returned isn't
protected! *)
val set : 'a t -> 'a -> unit
(** Atomically set the value.
{b NOTE} caution: using {!get} and {!set} as if this were a {!ref} is an
anti-pattern and will not protect data against some race conditions. *)
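(* For instance, incrementing a counter with [get] and [set] is racy, while
   [update] does the read-modify-write in a single critical section; a sketch:
   {[
     let counter : int Lock.t = Lock.create 0

     (* racy: two threads may read the same value and lose an increment *)
     let incr_bad () = Lock.set counter (Lock.get counter + 1)

     (* correct: atomic with respect to other operations on [counter] *)
     let incr_good () = Lock.update counter (fun x -> x + 1)
   ]} *)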

26
src/core/main.ml Normal file

@ -0,0 +1,26 @@
exception Oh_no of Exn_bt.t
let main' ?(block_signals = false) () (f : Runner.t -> 'a) : 'a =
let worker_st =
Fifo_pool.Private_.create_single_threaded_state ~thread:(Thread.self ())
~on_exn:(fun e bt -> raise (Oh_no (Exn_bt.make e bt)))
()
in
let runner = Fifo_pool.Private_.runner_of_state worker_st in
try
let fut = Fut.spawn ~on:runner (fun () -> f runner) in
Fut.on_result fut (fun _ -> Runner.shutdown_without_waiting runner);
(* run the main thread *)
Worker_loop_.worker_loop worker_st
~block_signals (* do not disturb existing thread *)
~ops:Fifo_pool.Private_.worker_ops;
match Fut.peek fut with
| Some (Ok x) -> x
| Some (Error ebt) -> Exn_bt.raise ebt
| None -> assert false
with Oh_no ebt -> Exn_bt.raise ebt
let main f =
main' () f ~block_signals:false (* do not disturb existing thread *)

30
src/core/main.mli Normal file

@ -0,0 +1,30 @@
(** Main thread.
This is evolved from [Moonpool.Immediate_runner], but unlike it, this API
assumes you run it in a thread (possibly the main thread) which will block
until the initial computation is done.
This means it's reasonable to use [Main.main (fun _runner -> do_everything)]
at the beginning of the program. Other Moonpool pools can be created for
background tasks, etc. to do the heavy lifting, and the main thread (inside
this immediate runner) can coordinate tasks via [Fut.await].
Aside from the fact that this blocks the caller thread, it is fairly similar
to {!Background_thread} in that there's a single worker to process
tasks/fibers.
This handles the concurrency effects used in moonpool, including [await] and
[yield].
This module was migrated from the late [Moonpool_fib].
@since 0.10 *)
val main : (Runner.t -> 'a) -> 'a
(** [main f] runs [f runner] in a scope that handles effects, including
{!Fut.await}.
This scope can run background tasks as well, in a cooperative fashion. *)
val main' : ?block_signals:bool -> unit -> (Runner.t -> 'a) -> 'a
(** Same as {!main} but with room for optional arguments. *)
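(* A sketch of a typical use: the main thread runs a cooperative scheduler and
   awaits tasks spawned on it:
   {[
     let () =
       Main.main @@ fun runner ->
       let t1 = Fut.spawn ~on:runner (fun () -> 1) in
       let t2 = Fut.spawn ~on:runner (fun () -> 2) in
       Printf.printf "%d\n" (Fut.await t1 + Fut.await t2)
   ]} *)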

45
src/core/moonpool.ml Normal file

@ -0,0 +1,45 @@
open Types_
exception Shutdown = Runner.Shutdown
let start_thread_on_some_domain f x =
let did = Random.int (Domain_pool_.max_number_of_domains ()) in
Domain_pool_.run_on_and_wait did (fun () -> Thread.create f x)
let run_async = Runner.run_async
let run_wait_block = Runner.run_wait_block
let get_current_runner = Runner.get_current_runner
let recommended_thread_count () = Domain_.recommended_number ()
let spawn = Fut.spawn
let spawn_on_current_runner = Fut.spawn_on_current_runner
let await = Fut.await
let yield = Picos.Fiber.yield
module Atomic = Atomic
module Blocking_queue = Bb_queue
module Background_thread = Background_thread
module Chan = Chan
module Exn_bt = Exn_bt
module Fifo_pool = Fifo_pool
module Fut = Fut
module Lock = Lock
module Main = Main
module Immediate_runner = struct end
module Runner = Runner
module Task_local_storage = Task_local_storage
module Thread_local_storage = Thread_local_storage
module Trigger = Trigger
module Ws_pool = Ws_pool
(* re-export main *)
include Main
module Private = struct
module Ws_deque_ = Ws_deque_
module Worker_loop_ = Worker_loop_
module Domain_ = Domain_
module Tracing_ = Tracing_
module Types_ = Types_
let num_domains = Domain_pool_.max_number_of_domains
end

233
src/core/moonpool.mli Normal file
View file

@ -0,0 +1,233 @@
(** Moonpool
A pool within a bigger pool (i.e. the ocean). Here, we're talking about pools
of [Thread.t] that are dispatched over several [Domain.t] to enable
parallelism.
We provide several implementations of pools with distinct scheduling
strategies, alongside some concurrency primitives such as guarding locks
({!Lock.t}) and futures ({!Fut.t}). *)
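(* A quickstart sketch using the toplevel re-exports below:
   {[
     let () =
       Moonpool.Main.main @@ fun _runner ->
       Moonpool.Ws_pool.with_ () @@ fun pool ->
       let f = Moonpool.spawn ~on:pool (fun () -> 1 + 1) in
       assert (Moonpool.await f = 2)
   ]} *)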
module Ws_pool = Ws_pool
module Fifo_pool = Fifo_pool
module Background_thread = Background_thread
module Runner = Runner
module Trigger = Trigger
module Main = Main
module Immediate_runner : sig end
[@@deprecated "use Main"]
(** Runner that runs tasks in the caller thread.
This was removed in 0.6 and is replaced by {!Main}. *)
module Exn_bt = Exn_bt
exception Shutdown
(** Exception raised when trying to run tasks on runners that have been shut
down.
@since 0.6 *)
val start_thread_on_some_domain : ('a -> unit) -> 'a -> Thread.t
(** Similar to {!Thread.create}, but it picks a background domain at random to
run the thread. This ensures that we don't always pick the same domain to
run all the various threads needed in an application (timers, event loops,
etc.) *)
val run_async : ?fiber:Picos.Fiber.t -> Runner.t -> (unit -> unit) -> unit
(** [run_async runner task] schedules the task to run on the given runner. This
means [task()] will be executed at some point in the future, possibly in
another thread.
@param fiber optional initial (picos) fiber state
@since 0.5 *)
val run_wait_block : ?fiber:Picos.Fiber.t -> Runner.t -> (unit -> 'a) -> 'a
(** [run_wait_block runner f] schedules [f] for later execution on the runner,
like {!run_async}. It then blocks the current thread until [f()] is done
executing, and returns its result. If [f()] raises an exception, then
[run_wait_block pool f] will raise it as well.
See {!run_async} for more details.
{b NOTE} be careful with deadlocks (see notes in {!Fut.wait_block} about the
required discipline to avoid deadlocks).
@raise Shutdown if the runner was already shut down
@since 0.6 *)
val recommended_thread_count : unit -> int
(** Number of threads recommended to saturate the CPU. For IO pools this makes
little sense (you might want more threads than this because many of them
will be blocked most of the time).
@since 0.5 *)
val spawn : on:Runner.t -> (unit -> 'a) -> 'a Fut.t
(** [spawn ~on f] runs [f()] on the runner (a thread pool typically) and returns
a future result for it. See {!Fut.spawn}.
@since 0.5 *)
val spawn_on_current_runner : (unit -> 'a) -> 'a Fut.t
(** See {!Fut.spawn_on_current_runner}.
@since 0.5 *)
val get_current_runner : unit -> Runner.t option
(** See {!Runner.get_current_runner}
@since 0.7 *)
val await : 'a Fut.t -> 'a
(** Await a future; this must be run on a moonpool runner. See {!Fut.await}.
Only on OCaml >= 5.0.
@since 0.5 *)
val yield : unit -> unit
(** Yield from the current task; this must be run on a moonpool runner. Only
on OCaml >= 5.0.
@since 0.9 *)
module Lock = Lock
module Fut = Fut
module Chan = Chan
module Task_local_storage = Task_local_storage
module Thread_local_storage = Thread_local_storage
(** A simple blocking queue.
This queue is quite basic and will not behave well under heavy contention.
However, it can be sufficient for many practical use cases.
{b NOTE}: this queue will typically block the caller thread in case the
operation (push/pop) cannot proceed. Be wary of deadlocks when using the
queue {i from} a pool when you expect the other end to also be
produced/consumed from the same pool.
See discussion on {!Fut.wait_block} for more details on deadlocks and how to
mitigate the risk of running into them.
More scalable queues can be found in Lockfree
(https://github.com/ocaml-multicore/lockfree/) *)
module Blocking_queue : sig
type 'a t
(** Unbounded blocking queue.
This queue is thread-safe and will block when calling {!pop} on it when
it's empty. *)
val create : unit -> _ t
(** Create a new unbounded queue. *)
val size : _ t -> int
(** Number of items currently in the queue. Note that [pop] might still block
if this returns a non-zero number, since another thread might have
consumed the items in the meantime.
@since 0.2 *)
exception Closed
val push : 'a t -> 'a -> unit
(** [push q x] pushes [x] into [q], and returns [()].
In the current implementation, [push q] will never block for a long time;
it only blocks while waiting for the lock so it can push the element.
@raise Closed if the queue is closed (by a previous call to [close q]) *)
val pop : 'a t -> 'a
(** [pop q] pops the next element in [q]. It might block until an element
comes.
@raise Closed if the queue was closed before a new element was available.
*)
val close : _ t -> unit
(** Close the queue, meaning there won't be any more [push] allowed, i.e.
[push] will raise {!Closed}.
[pop] will keep working and will return the elements present in the queue,
until it's entirely drained; then [pop] will also raise {!Closed}. *)
val try_pop : force_lock:bool -> 'a t -> 'a option
(** [try_pop q] immediately pops the first element of [q], if any, or returns
[None] without blocking.
@param force_lock
if true, use {!Mutex.lock} (which can block under contention); if false,
use {!Mutex.try_lock}, which might return [None] even in presence of an
element if there's contention *)
val try_push : 'a t -> 'a -> bool
(** [try_push q x] tries to push into [q], in which case it returns [true]; or
it fails to push and returns [false] without blocking.
@raise Closed if the locking succeeded but the queue is closed. *)
val transfer : 'a t -> 'a Queue.t -> unit
(** [transfer bq q2] transfers all items presently in [bq] into [q2] in one
atomic section, and clears [bq]. It blocks if no element is in [bq].
This is useful to consume elements from the queue in batch. Create a
[Queue.t] locally:
{[
let dowork (work_queue : job Bb_queue.t) =
(* local queue, not thread safe *)
let local_q = Queue.create () in
try
while true do
(* work on local events, already on this thread *)
while not (Queue.is_empty local_q) do
let job = Queue.pop local_q in
process_job job
done;
(* get all the events in the incoming blocking queue, in
one single critical section. *)
Bb_queue.transfer work_queue local_q
done
with Bb_queue.Closed -> ()
]}
@since 0.4 *)
type 'a gen = unit -> 'a option
type 'a iter = ('a -> unit) -> unit
val to_iter : 'a t -> 'a iter
(** [to_iter q] returns an iterator over all items in the queue. This might
not terminate if [q] is never closed.
@since 0.4 *)
val to_gen : 'a t -> 'a gen
(** [to_gen q] returns a generator from the queue.
@since 0.4 *)
val to_seq : 'a t -> 'a Seq.t
(** [to_seq q] returns a (transient) sequence from the queue.
@since 0.4 *)
end
module Atomic = Atomic
(** Atomic values.
This is either a shim using [ref], on pre-OCaml 5, or the standard [Atomic]
module on OCaml 5. *)
include module type of struct
include Main
end
(**/**)
(** Private internals, with no stability guarantees *)
module Private : sig
module Ws_deque_ = Ws_deque_
(** A deque for work stealing, fixed size. *)
module Worker_loop_ = Worker_loop_
(** Worker loop. This is useful to implement custom runners, it should run on
each thread of the runner.
@since 0.7 *)
module Domain_ = Domain_
(** Utils for domains *)
module Tracing_ = Tracing_
module Types_ = Types_
val num_domains : unit -> int
(** Number of domains in the backing domain pool *)
end

63
src/core/runner.ml Normal file

@ -0,0 +1,63 @@
open Types_
type fiber = Picos.Fiber.t
type task = unit -> unit
type t = runner = {
run_async: fiber:fiber -> task -> unit;
shutdown: wait:bool -> unit -> unit;
size: unit -> int;
num_tasks: unit -> int;
}
exception Shutdown
let[@inline] run_async ?fiber (self : t) f : unit =
let fiber =
match fiber with
| Some f -> f
| None ->
let comp = Picos.Computation.create () in
Picos.Fiber.create ~forbid:false comp
in
self.run_async ~fiber f
let[@inline] shutdown (self : t) : unit = self.shutdown ~wait:true ()
let[@inline] shutdown_without_waiting (self : t) : unit =
self.shutdown ~wait:false ()
let[@inline] num_tasks (self : t) : int = self.num_tasks ()
let[@inline] size (self : t) : int = self.size ()
let run_wait_block ?fiber self (f : unit -> 'a) : 'a =
let q = Bb_queue.create () in
run_async ?fiber self (fun () ->
try
let x = f () in
Bb_queue.push q (Ok x)
with exn ->
let bt = Printexc.get_raw_backtrace () in
Bb_queue.push q (Error (exn, bt)));
match Bb_queue.pop q with
| Ok x -> x
| Error (exn, bt) -> Printexc.raise_with_backtrace exn bt
module For_runner_implementors = struct
let create ~size ~num_tasks ~shutdown ~run_async () : t =
{ size; num_tasks; shutdown; run_async }
let k_cur_runner : t TLS.t = Types_.k_cur_runner
end
let dummy : t =
For_runner_implementors.create
~size:(fun () -> 0)
~num_tasks:(fun () -> 0)
~shutdown:(fun ~wait:_ () -> ())
~run_async:(fun ~fiber:_ _ ->
failwith "Runner.dummy: cannot actually run tasks")
()
let get_current_runner = get_current_runner
let get_current_fiber = get_current_fiber

88
src/core/runner.mli Normal file

@ -0,0 +1,88 @@
(** Interface for runners.
This provides an abstraction for running tasks in the background, which is
implemented by various thread pools.
@since 0.3 *)
type fiber = Picos.Fiber.t
type task = unit -> unit
type t
(** A runner.
If a runner is no longer needed, {!shutdown} can be used to signal all
worker threads in it to stop (after they finish their work), and wait for
them to stop.
The threads are distributed across a fixed domain pool (whose size is
determined by {!Domain.recommended_domain_count} on OCaml 5, and simply the
single runtime on OCaml 4). *)
val size : t -> int
(** Number of threads/workers. *)
val num_tasks : t -> int
(** Current number of tasks. This is at best a snapshot, useful for metrics and
debugging. *)
val shutdown : t -> unit
(** Shutdown the runner and wait for it to terminate. Idempotent. *)
val shutdown_without_waiting : t -> unit
(** Shutdown the pool, and do not wait for it to terminate. Idempotent. *)
exception Shutdown
val run_async : ?fiber:fiber -> t -> task -> unit
(** [run_async pool f] schedules [f] for later execution on the runner in one of
the threads. [f()] will run on one of the runner's worker threads/domains.
@param fiber if provided, run the task with this initial fiber data
@raise Shutdown if the runner was shut down before [run_async] was called.
*)
val run_wait_block : ?fiber:fiber -> t -> (unit -> 'a) -> 'a
(** [run_wait_block pool f] schedules [f] for later execution on the pool, like
{!run_async}. It then blocks the current thread until [f()] is done
executing, and returns its result. If [f()] raises an exception, then
[run_wait_block pool f] will raise it as well.
{b NOTE} be careful with deadlocks (see notes in {!Fut.wait_block} about the
required discipline to avoid deadlocks).
@raise Shutdown if the runner was already shut down *)
val dummy : t
(** Runner that fails when scheduling tasks on it. Calling {!run_async} on it
will raise Failure.
@since 0.6 *)
(** {2 Implementing runners} *)
(** This module is specifically intended for users who implement their own
runners. Regular users of Moonpool should not need to look at it. *)
module For_runner_implementors : sig
val create :
size:(unit -> int) ->
num_tasks:(unit -> int) ->
shutdown:(wait:bool -> unit -> unit) ->
run_async:(fiber:fiber -> task -> unit) ->
unit ->
t
(** Create a new runner.
{b NOTE}: the runner should support DLA and {!Suspend_} on OCaml 5.x, so
that {!Fork_join} and other 5.x features work properly. *)
val k_cur_runner : t Thread_local_storage.t
(** Key that should be used by each runner to store itself in TLS on every
thread it controls, so that tasks running on these threads can access the
runner. This is necessary for {!get_current_runner} to work. *)
end
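(* A toy sketch: a runner that executes each task immediately in the caller's
   thread (a real runner would also install itself in [k_cur_runner] and
   handle effects):
   {[
     let immediate : Runner.t =
       Runner.For_runner_implementors.create
         ~size:(fun () -> 1)
         ~num_tasks:(fun () -> 0)
         ~shutdown:(fun ~wait:_ () -> ())
         ~run_async:(fun ~fiber:_ f -> f ())
         ()
   ]} *)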
val get_current_runner : unit -> t option
(** Access the current runner. This returns [Some r] if the call happens on a
thread that belongs in a runner.
@since 0.5 *)
val get_current_fiber : unit -> fiber option
(** [get_current_fiber ()] returns the fiber for the currently running task,
if any. *)


@ -0,0 +1,44 @@
open Types_
module PF = Picos.Fiber
type 'a t = 'a PF.FLS.t
exception Not_set = PF.FLS.Not_set
let create = PF.FLS.create
let[@inline] get_exn k =
let fiber = get_current_fiber_exn () in
PF.FLS.get_exn fiber k
let get_opt k =
match get_current_fiber () with
| None -> None
| Some fiber ->
(match PF.FLS.get_exn fiber k with
| x -> Some x
| exception Not_set -> None)
let[@inline] get k ~default =
match get_current_fiber () with
| None -> default
| Some fiber -> PF.FLS.get fiber ~default k
let[@inline] set k v : unit =
let fiber = get_current_fiber_exn () in
PF.FLS.set fiber k v
let with_value k v (f : _ -> 'b) : 'b =
let fiber = get_current_fiber_exn () in
match PF.FLS.get_exn fiber k with
| exception Not_set ->
PF.FLS.set fiber k v;
(* nothing to restore back to, just call [f] *)
f ()
| old_v ->
PF.FLS.set fiber k v;
let finally () = PF.FLS.set fiber k old_v in
Fun.protect f ~finally
include Hmap_ls_


@ -0,0 +1,43 @@
(** Task-local storage.
This storage is associated to the current task, just like thread-local
storage is associated with the current thread. The storage is carried along
in case the current task is suspended.
@since 0.6 *)
type 'a t = 'a Picos.Fiber.FLS.t
val create : unit -> 'a t
(** [create ()] makes a new key. Keys are expensive and should never be
allocated dynamically or in a loop. *)
exception Not_set
val get_exn : 'a t -> 'a
(** [get_exn k] gets the value for the current task for key [k]. Must be run
from inside a task running on a runner.
@raise Not_set otherwise *)
val get_opt : 'a t -> 'a option
(** [get_opt k] gets the current task's value for key [k], or [None] if not
run from inside a task. *)
val get : 'a t -> default:'a -> 'a
(** [get k ~default] gets the value for the current task for key [k], or
[default] if none is set or if we're not running inside a task. *)
val set : 'a t -> 'a -> unit
(** [set k v] sets the storage for [k] to [v]. Must be run from inside a task
running on a runner.
@raise Failure otherwise *)
val with_value : 'a t -> 'a -> (unit -> 'b) -> 'b
(** [with_value k v f] sets [k] to [v] for the duration of the call to [f()].
When [f()] returns (or fails), [k] is restored to its old value. *)
(** {2 Local [Hmap.t]}
This requires [hmap] to be installed. *)
include module type of struct
include Hmap_ls_
end
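(* A sketch: carrying a request id across a task ([handle_request] must run
   inside a task on a runner; the key name is arbitrary):
   {[
     let k_req_id : string Task_local_storage.t = Task_local_storage.create ()

     let handle_request () =
       Task_local_storage.with_value k_req_id "req-42" @@ fun () ->
       (* any code called from here, within the same task, sees the binding *)
       assert (Task_local_storage.get k_req_id ~default:"none" = "req-42")
   ]} *)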

6
src/core/trigger.ml Normal file

@ -0,0 +1,6 @@
(** Triggers from picos
@since 0.7 *)
include Picos.Trigger
let[@inline] await_exn (self : t) = await self |> Option.iter Exn_bt.raise

38
src/core/types_.ml Normal file

@ -0,0 +1,38 @@
module TLS = Thread_local_storage
module Domain_pool_ = Moonpool_dpool
type task = unit -> unit
type fiber = Picos.Fiber.t
type runner = {
run_async: fiber:fiber -> task -> unit;
shutdown: wait:bool -> unit -> unit;
size: unit -> int;
num_tasks: unit -> int;
}
let k_cur_runner : runner TLS.t = TLS.create ()
let k_cur_fiber : fiber TLS.t = TLS.create ()
let _dummy_computation : Picos.Computation.packed =
let c = Picos.Computation.create () in
Picos.Computation.cancel c (Failure "dummy fiber") (Printexc.get_callstack 0);
Picos.Computation.Packed c
let _dummy_fiber = Picos.Fiber.create_packed ~forbid:true _dummy_computation
let[@inline] get_current_runner () : _ option = TLS.get_opt k_cur_runner
let[@inline] get_current_fiber () : fiber option =
match TLS.get_exn k_cur_fiber with
| f when f != _dummy_fiber -> Some f
| _ -> None
| exception TLS.Not_set -> None
let error_get_current_fiber_ =
"Moonpool: get_current_fiber was called outside of a fiber."
let[@inline] get_current_fiber_exn () : fiber =
match TLS.get_exn k_cur_fiber with
| f when f != _dummy_fiber -> f
| _ -> failwith error_get_current_fiber_
| exception TLS.Not_set -> failwith error_get_current_fiber_

11
src/core/util_pool_.ml Normal file

@ -0,0 +1,11 @@
let num_threads ?num_threads () : int =
let n_domains = Moonpool_dpool.max_number_of_domains () in
(* number of threads to run *)
let num_threads =
match num_threads with
| Some j -> max 1 j
| None -> n_domains
in
num_threads

5
src/core/util_pool_.mli Normal file

@ -0,0 +1,5 @@
(** Utils for pools *)
val num_threads : ?num_threads:int -> unit -> int
(** Number of threads a pool should have.
@param num_threads user-specified number of threads *)

192
src/core/worker_loop_.ml Normal file

@ -0,0 +1,192 @@
open Types_
type fiber = Picos.Fiber.t
type task_full =
| T_start of {
fiber: fiber;
f: unit -> unit;
}
| T_resume : {
fiber: fiber;
k: unit -> unit;
}
-> task_full
exception No_more_tasks
type 'st ops = {
schedule: 'st -> task_full -> unit;
get_next_task: 'st -> task_full; (** @raise No_more_tasks *)
on_exn: 'st -> Exn_bt.t -> unit;
runner: 'st -> Runner.t;
before_start: 'st -> unit;
cleanup: 'st -> unit;
}
(** A dummy task. *)
let _dummy_task : task_full = T_start { f = ignore; fiber = _dummy_fiber }
let[@inline] discontinue k exn =
let bt = Printexc.get_raw_backtrace () in
Effect.Deep.discontinue_with_backtrace k exn bt
let[@inline] raise_with_bt exn =
let bt = Printexc.get_raw_backtrace () in
Printexc.raise_with_backtrace exn bt
let with_handler (type st) ~(ops : st ops) (self : st) : (unit -> unit) -> unit
=
let current =
Some
(fun k ->
match get_current_fiber_exn () with
| fiber -> Effect.Deep.continue k fiber
| exception exn -> discontinue k exn)
and yield =
Some
(fun k ->
let fiber = get_current_fiber_exn () in
match
let k () = Effect.Deep.continue k () in
ops.schedule self @@ T_resume { fiber; k }
with
| () -> ()
| exception exn -> discontinue k exn)
and reschedule trigger fiber k : unit =
ignore (Picos.Fiber.unsuspend fiber trigger : bool);
let k () = Picos.Fiber.resume fiber k in
let task = T_resume { fiber; k } in
ops.schedule self task
in
let effc (type a) :
a Effect.t -> ((a, _) Effect.Deep.continuation -> _) option = function
| Picos.Fiber.Current -> current
| Picos.Fiber.Yield -> yield
| Picos.Fiber.Spawn r ->
Some
(fun k ->
match
let f () = r.main r.fiber in
let task = T_start { fiber = r.fiber; f } in
ops.schedule self task
with
| unit -> Effect.Deep.continue k unit
| exception exn -> discontinue k exn)
| Picos.Trigger.Await trigger ->
Some
(fun k ->
let fiber = get_current_fiber_exn () in
(* when the trigger is signaled, reschedule the task *)
if not (Picos.Fiber.try_suspend fiber trigger fiber k reschedule) then
(* trigger was already signaled, reschedule task now *)
reschedule trigger fiber k)
| Picos.Computation.Cancel_after _r ->
Some
(fun k ->
(* not implemented *)
let exn = Failure "Moonpool: cancel_after is not supported." in
discontinue k exn)
| _ -> None
in
let handler = Effect.Deep.{ retc = Fun.id; exnc = raise_with_bt; effc } in
fun f -> Effect.Deep.match_with f () handler
module type FINE_GRAINED_ARGS = sig
type st
val ops : st ops
val st : st
end
module Fine_grained (Args : FINE_GRAINED_ARGS) () = struct
open Args
let cur_fiber : fiber ref = ref _dummy_fiber
let runner = ops.runner st
type state =
| New
| Ready
| Torn_down
let state = ref New
let run_task (task : task_full) : unit =
let fiber =
match task with
| T_start { fiber; _ } | T_resume { fiber; _ } -> fiber
in
cur_fiber := fiber;
TLS.set k_cur_fiber fiber;
(* let _ctx = before_task runner in *)
(* run the task now, catching errors, handling effects *)
assert (task != _dummy_task);
(try
match task with
| T_start { fiber = _; f } -> with_handler ~ops st f
| T_resume { fiber = _; k } ->
(* this is already in an effect handler *)
k ()
with e ->
let bt = Printexc.get_raw_backtrace () in
let ebt = Exn_bt.make e bt in
ops.on_exn st ebt);
(* after_task runner _ctx; *)
cur_fiber := _dummy_fiber;
TLS.set k_cur_fiber _dummy_fiber
let setup ~block_signals () : unit =
if !state <> New then invalid_arg "worker_loop.setup: not a new instance";
state := Ready;
if block_signals then Signals_.ignore_signals_ ();
TLS.set Runner.For_runner_implementors.k_cur_runner runner;
ops.before_start st
let run ?(max_tasks = max_int) () : unit =
if !state <> Ready then invalid_arg "worker_loop.run: not setup";
let continue = ref true in
let n_tasks = ref 0 in
while !continue && !n_tasks < max_tasks do
match ops.get_next_task st with
| task ->
incr n_tasks;
run_task task
| exception No_more_tasks -> continue := false
done
let teardown () =
if !state <> Torn_down then (
state := Torn_down;
cur_fiber := _dummy_fiber;
ops.cleanup st
)
end
let worker_loop (type st) ~block_signals ~(ops : st ops) (self : st) : unit =
let module FG =
Fine_grained
(struct
type nonrec st = st
let ops = ops
let st = self
end)
()
in
FG.setup ~block_signals ();
try
FG.run ();
FG.teardown ()
with exn ->
let bt = Printexc.get_raw_backtrace () in
FG.teardown ();
Printexc.raise_with_backtrace exn bt

51
src/core/worker_loop_.mli Normal file

@ -0,0 +1,51 @@
(** Internal module that is used for workers.
A thread pool should use this [worker_loop] to run tasks, handle effects,
etc. *)
open Types_
type task_full =
| T_start of {
fiber: fiber;
f: unit -> unit;
}
| T_resume : {
fiber: fiber;
k: unit -> unit;
}
-> task_full
val _dummy_task : task_full
exception No_more_tasks
type 'st ops = {
schedule: 'st -> task_full -> unit;
get_next_task: 'st -> task_full;
on_exn: 'st -> Exn_bt.t -> unit;
runner: 'st -> Runner.t;
before_start: 'st -> unit;
cleanup: 'st -> unit;
}
module type FINE_GRAINED_ARGS = sig
type st
val ops : st ops
val st : st
end
module Fine_grained (_ : FINE_GRAINED_ARGS) () : sig
val setup : block_signals:bool -> unit -> unit
(** Just initialize the loop *)
val run : ?max_tasks:int -> unit -> unit
(** Run the loop until no task remains or until [max_tasks] tasks have been
run *)
val teardown : unit -> unit
(** Tear down the loop *)
end
val worker_loop : block_signals:bool -> ops:'st ops -> 'st -> unit
(** Run a full worker loop: set up, process tasks until none remain, then tear
down (including on exceptions). *)
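(* A sketch of a custom single-threaded runner built on this loop; [my_state]
   and its fields are hypothetical:
   {[
     type my_state = {
       q: task_full Queue.t;
       as_runner: Runner.t;
     }

     let my_ops : my_state ops = {
       schedule = (fun st t -> Queue.push t st.q);
       get_next_task =
         (fun st -> try Queue.pop st.q with Queue.Empty -> raise No_more_tasks);
       on_exn = (fun _ ebt -> Exn_bt.raise ebt);
       runner = (fun st -> st.as_runner);
       before_start = ignore;
       cleanup = ignore;
     }

     (* then, on the worker thread: *)
     let run_worker st = worker_loop ~block_signals:false ~ops:my_ops st
   ]} *)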

320
src/core/ws_pool.ml Normal file

@ -0,0 +1,320 @@
open Types_
module A = Atomic
module WSQ = Ws_deque_
module WL = Worker_loop_
include Runner
let ( let@ ) = ( @@ )
module Id = struct
type t = unit ref
(** Unique identifier for a pool *)
let create () : t = Sys.opaque_identity (ref ())
let equal : t -> t -> bool = ( == )
end
type state = {
id_: Id.t;
(** Unique to this pool. Used to make sure tasks stay within the same
pool. *)
active: bool A.t; (** Becomes [false] when the pool is shutdown. *)
mutable workers: worker_state array; (** Fixed set of workers. *)
main_q: WL.task_full Queue.t;
(** Main queue for tasks coming from the outside *)
mutable n_waiting: int; (* protected by mutex *)
mutable n_waiting_nonzero: bool; (** [n_waiting > 0] *)
mutex: Mutex.t;
cond: Condition.t;
mutable as_runner: t;
(* init options *)
name: string option;
on_init_thread: dom_id:int -> t_id:int -> unit -> unit;
on_exit_thread: dom_id:int -> t_id:int -> unit -> unit;
on_exn: exn -> Printexc.raw_backtrace -> unit;
}
(** internal state *)
and worker_state = {
mutable thread: Thread.t;
idx: int;
dom_id: int;
st: state;
q: WL.task_full WSQ.t; (** Work stealing queue *)
rng: Random.State.t;
}
(** State for a given worker. Only this worker is allowed to push into the
queue, but other workers can come and steal from it if they're idle. *)
let[@inline] size_ (self : state) = Array.length self.workers
let num_tasks_ (self : state) : int =
let n = ref 0 in
n := Queue.length self.main_q;
Array.iter (fun w -> n := !n + WSQ.size w.q) self.workers;
!n
(** TLS, used by workers to store their specific state and retrieve it
from tasks when scheduling new sub-tasks. *)
let k_worker_state : worker_state TLS.t = TLS.create ()
let[@inline] get_current_worker_ () : worker_state option =
TLS.get_opt k_worker_state
(** Try to wake up a waiter, if there's any. *)
let[@inline] try_wake_someone_ (self : state) : unit =
if self.n_waiting_nonzero then (
Mutex.lock self.mutex;
Condition.signal self.cond;
Mutex.unlock self.mutex
)
(** Push into the worker's local queue, open to work stealing. Precondition:
this runs on the worker thread whose state is [self]. *)
let schedule_on_current_worker (self : worker_state) task : unit =
(* we're on this same pool, schedule in the worker's state. Otherwise
we might also be on pool A but asking to schedule on pool B,
so we have to check that identifiers match. *)
let pushed = WSQ.push self.q task in
if pushed then
try_wake_someone_ self.st
else (
(* overflow into main queue *)
Mutex.lock self.st.mutex;
Queue.push task self.st.main_q;
if self.st.n_waiting_nonzero then Condition.signal self.st.cond;
Mutex.unlock self.st.mutex
)
(** Push into the shared queue of this pool *)
let schedule_in_main_queue (self : state) task : unit =
if A.get self.active then (
(* push into the main queue *)
Mutex.lock self.mutex;
Queue.push task self.main_q;
if self.n_waiting_nonzero then Condition.signal self.cond;
Mutex.unlock self.mutex
) else
(* notify the caller that scheduling tasks is no
longer permitted *)
raise Shutdown
let schedule_from_w (self : worker_state) (task : WL.task_full) : unit =
match get_current_worker_ () with
| Some w when Id.equal self.st.id_ w.st.id_ ->
(* use worker from the same pool *)
schedule_on_current_worker w task
| _ -> schedule_in_main_queue self.st task
exception Got_task of WL.task_full
(** Try to steal a task.
@raise Got_task if it finds one. *)
let try_to_steal_work_once_ (self : worker_state) : unit =
let init = Random.State.int self.rng (Array.length self.st.workers) in
for i = 0 to Array.length self.st.workers - 1 do
let w' =
Array.unsafe_get self.st.workers
((i + init) mod Array.length self.st.workers)
in
if self != w' then (
match WSQ.steal w'.q with
| Some t -> raise_notrace (Got_task t)
| None -> ()
)
done
(** Wait on condition. Precondition: we hold the mutex. *)
let[@inline] wait_for_condition_ (self : state) : unit =
self.n_waiting <- self.n_waiting + 1;
if self.n_waiting = 1 then self.n_waiting_nonzero <- true;
Condition.wait self.cond self.mutex;
self.n_waiting <- self.n_waiting - 1;
if self.n_waiting = 0 then self.n_waiting_nonzero <- false
let rec get_next_task (self : worker_state) : WL.task_full =
(* see if we can empty the local queue *)
match WSQ.pop_exn self.q with
| task ->
try_wake_someone_ self.st;
task
| exception WSQ.Empty -> try_to_steal_from_other_workers_ self
and try_to_steal_from_other_workers_ (self : worker_state) =
match try_to_steal_work_once_ self with
| exception Got_task task -> task
| () -> wait_on_main_queue self
and wait_on_main_queue (self : worker_state) : WL.task_full =
Mutex.lock self.st.mutex;
match Queue.pop self.st.main_q with
| task ->
Mutex.unlock self.st.mutex;
task
| exception Queue.Empty ->
(* wait here *)
if A.get self.st.active then (
wait_for_condition_ self.st;
(* see if a task became available *)
match Queue.pop self.st.main_q with
| task ->
Mutex.unlock self.st.mutex;
task
| exception Queue.Empty ->
Mutex.unlock self.st.mutex;
try_to_steal_from_other_workers_ self
) else (
(* do nothing more: no task in main queue, and we are shutting
down so no new task should arrive.
The exception is if another task is creating subtasks
that overflow into the main queue, but we can ignore that at
the price of slightly decreased performance for the last few
tasks *)
Mutex.unlock self.st.mutex;
raise WL.No_more_tasks
)
let before_start (self : worker_state) : unit =
let t_id = Thread.id @@ Thread.self () in
self.st.on_init_thread ~dom_id:self.dom_id ~t_id ();
TLS.set k_cur_fiber _dummy_fiber;
TLS.set Runner.For_runner_implementors.k_cur_runner self.st.as_runner;
TLS.set k_worker_state self;
(* set thread name *)
Option.iter
(fun name ->
Tracing_.set_thread_name (Printf.sprintf "%s.worker.%d" name self.idx))
self.st.name
let cleanup (self : worker_state) : unit =
(* on termination, decrease refcount of underlying domain *)
Domain_pool_.decr_on self.dom_id;
let t_id = Thread.id @@ Thread.self () in
self.st.on_exit_thread ~dom_id:self.dom_id ~t_id ()
let worker_ops : worker_state WL.ops =
let runner (st : worker_state) = st.st.as_runner in
let on_exn (st : worker_state) (ebt : Exn_bt.t) =
st.st.on_exn (Exn_bt.exn ebt) (Exn_bt.bt ebt)
in
{
WL.schedule = schedule_from_w;
runner;
get_next_task;
on_exn;
before_start;
cleanup;
}
let default_thread_init_exit_ ~dom_id:_ ~t_id:_ () = ()
let shutdown_ ~wait (self : state) : unit =
if A.exchange self.active false then (
Mutex.lock self.mutex;
Condition.broadcast self.cond;
Mutex.unlock self.mutex;
if wait then Array.iter (fun w -> Thread.join w.thread) self.workers
)
let as_runner_ (self : state) : t =
Runner.For_runner_implementors.create
~shutdown:(fun ~wait () -> shutdown_ self ~wait)
~run_async:(fun ~fiber f ->
schedule_in_main_queue self @@ T_start { fiber; f })
~size:(fun () -> size_ self)
~num_tasks:(fun () -> num_tasks_ self)
()
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?num_threads:int ->
?name:string ->
'a
(** Arguments used in {!create}. See {!create} for explanations. *)
let create ?(on_init_thread = default_thread_init_exit_)
?(on_exit_thread = default_thread_init_exit_) ?(on_exn = fun _ _ -> ())
?num_threads ?name () : t =
let pool_id_ = Id.create () in
let num_domains = Domain_pool_.max_number_of_domains () in
let num_threads = Util_pool_.num_threads ?num_threads () in
(* make sure we don't bias towards the first domain(s) in {!Domain_pool_} *)
let offset = Random.int num_domains in
let pool =
{
id_ = pool_id_;
active = A.make true;
workers = [||];
main_q = Queue.create ();
n_waiting = 0;
n_waiting_nonzero = false; (* no waiters yet, [n_waiting = 0] *)
mutex = Mutex.create ();
cond = Condition.create ();
on_exn;
on_init_thread;
on_exit_thread;
name;
as_runner = Runner.dummy;
}
in
pool.as_runner <- as_runner_ pool;
(* temporary queue used to obtain thread handles from domains
on which the thread are started. *)
let receive_threads = Bb_queue.create () in
(* create the worker state with index [idx] *)
let create_worker_state idx =
let dom_id = (offset + idx) mod num_domains in
{
st = pool;
thread = (* dummy *) Thread.self ();
q = WSQ.create ~dummy:WL._dummy_task ();
rng = Random.State.make [| idx |];
dom_id;
idx;
}
in
pool.workers <- Array.init num_threads create_worker_state;
(* start the thread with index [i] *)
let start_thread_with_idx idx (st : worker_state) =
(* function called in domain with index [i], to
create the thread and push it into [receive_threads] *)
let create_thread_in_domain () =
let thread =
Thread.create (WL.worker_loop ~block_signals:true ~ops:worker_ops) st
in
(* send the thread from the domain back to us *)
Bb_queue.push receive_threads (idx, thread)
in
Domain_pool_.run_on st.dom_id create_thread_in_domain
in
(* start all worker threads, placing them on the domains
according to their index and [offset] in a round-robin fashion. *)
Array.iteri start_thread_with_idx pool.workers;
(* receive the newly created threads back from domains *)
for _j = 1 to num_threads do
let i, th = Bb_queue.pop receive_threads in
let worker_state = pool.workers.(i) in
worker_state.thread <- th
done;
pool.as_runner
let with_ ?on_init_thread ?on_exit_thread ?on_exn ?num_threads ?name () f =
let pool =
create ?on_init_thread ?on_exit_thread ?on_exn ?num_threads ?name ()
in
let@ () = Fun.protect ~finally:(fun () -> shutdown pool) in
f pool

src/core/ws_pool.mli
@ -0,0 +1,57 @@
(** Work-stealing thread pool.
A pool of threads with a work-stealing scheduler. The pool contains a
fixed number of worker threads that wait for work items to come, process
these, and loop.
This is good for CPU-intensive workloads made of many small tasks. Note
that tasks will not always be processed in the order they are scheduled, so
this is not great for workloads where the latency of individual tasks matters
(for that see {!Fifo_pool}).
This implements {!Runner.t} since 0.3.
If a pool is no longer needed, {!shutdown} can be used to signal all threads
in it to stop (after they finish their work), and wait for them to stop.
The threads are distributed across a fixed domain pool (whose size is
determined by {!Domain.recommended_domain_count}). See {!create} for more
details. *)
include module type of Runner
type ('a, 'b) create_args =
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?num_threads:int ->
?name:string ->
'a
(** Arguments used in {!create}. See {!create} for explanations. *)
val create : (unit -> t, _) create_args
(** [create ()] makes a new thread pool.
@param on_init_thread
called at the beginning of each new thread in the pool.
@param num_threads
size of the pool, ie. number of worker threads. It will be at least [1]
internally, so [0] or negative values make no sense. The default is
[Domain.recommended_domain_count()], ie one worker thread per CPU core.
Note that specifying [num_threads=n] means that the degree of parallelism is
at most [n]. This behavior differs from that of [Domainslib], see
https://github.com/c-cube/moonpool/issues/41 for context.
If you want to use all cores, use [Domain.recommended_domain_count()].
@param on_exit_thread called at the end of each thread in the pool
@param name
a name for this thread pool, used if tracing is enabled (since 0.6) *)
val with_ : (unit -> (t -> 'a) -> 'a, _) create_args
(** [with_ () f] calls [f pool], where [pool] is obtained via {!create}. When
[f pool] returns or fails, [pool] is shutdown and its resources are
released.
Most parameters are the same as in {!create}.
@since 0.3 *)
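(* Example usage (a hedged sketch, not from the library's docs; it assumes
[Moonpool.Fut.spawn] and [Moonpool.Fut.wait_block_exn] as seen elsewhere
in this diff):
{[
let () =
Moonpool.Ws_pool.with_ ~num_threads:4 () @@ fun pool ->
let fut = Moonpool.Fut.spawn ~on:pool (fun () -> 21 * 2) in
assert (Moonpool.Fut.wait_block_exn fut = 42)
]} *)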

@ -1,35 +0,0 @@
type domain = Domain_.t
let work_ _i q : unit =
while true do
let f = Bb_queue.pop q in
try f () with _ -> ()
done
(* A domain level worker. It should not do too much except for starting
new threads for pools. *)
type worker = { q: (unit -> unit) Bb_queue.t } [@@unboxed]
let domains_ : worker array lazy_t =
lazy
((* number of domains we spawn. Note that we spawn n-1 domains
because there already is the main domain running. *)
let n = max 1 (Domain_.recommended_number () - 1) in
Array.init n (fun i ->
let q = Bb_queue.create () in
let _domain : domain = Domain_.spawn (fun () -> work_ i q) in
{ q }))
let[@inline] n_domains () : int = Array.length (Lazy.force domains_)
let run_on (i : int) (f : unit -> unit) : unit =
let (lazy arr) = domains_ in
assert (i < Array.length arr);
Bb_queue.push arr.(i).q f
let run_on_and_wait (i : int) (f : unit -> 'a) : 'a =
let q = Bb_queue.create () in
run_on i (fun () ->
let x = f () in
Bb_queue.push q x);
Bb_queue.pop q

@ -1,20 +0,0 @@
(** Static pool of domains.
These domains are shared between {b all} the pools in moonpool.
The rationale is that we should not have more domains than cores, so
it's easier to pre-allocate exactly that many domains, and run more flexible
thread pools on top.
*)
type domain = Domain_.t
val n_domains : unit -> int
(** Number of domains in the pool *)
val run_on : int -> (unit -> unit) -> unit
(** [run_on i f] runs [f()] on the domain with index [i].
Precondition: [0 <= i < n_domains()] *)
val run_on_and_wait : int -> (unit -> 'a) -> 'a
(** [run_on_and_wait i f] runs [f()] on the domain with index [i],
and blocks until the result of [f()] is returned back. *)

src/dpool/dune
@ -0,0 +1,6 @@
(library
(name moonpool_dpool)
(public_name moonpool.dpool)
(synopsis "Moonpool's domain pool (used to start worker threads)")
(flags :standard -open Moonpool_private)
(libraries moonpool.private))

src/dpool/moonpool_dpool.ml
@ -0,0 +1,188 @@
module Bb_queue = struct
type 'a t = {
mutex: Mutex.t;
cond: Condition.t;
q: 'a Queue.t;
}
let create () : _ t =
{ mutex = Mutex.create (); cond = Condition.create (); q = Queue.create () }
let push (self : _ t) x : unit =
Mutex.lock self.mutex;
let was_empty = Queue.is_empty self.q in
Queue.push x self.q;
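(* only wake waiters on the empty -> non-empty transition; a waiter
that wakes up re-checks emptiness in its loop anyway *)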
if was_empty then Condition.broadcast self.cond;
Mutex.unlock self.mutex
let pop (type a) (self : a t) : a =
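(* blocks until an element is available; a local exception is used to
return the popped value from inside the [while] loop *)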
let module M = struct
exception Found of a
end in
try
Mutex.lock self.mutex;
while true do
if Queue.is_empty self.q then
Condition.wait self.cond self.mutex
else (
let x = Queue.pop self.q in
Mutex.unlock self.mutex;
raise (M.Found x)
)
done;
assert false
with M.Found x -> x
end
module Lock = struct
type 'a t = {
mutex: Mutex.t;
mutable content: 'a;
}
let create content : _ t = { mutex = Mutex.create (); content }
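(* [@inline never]: keep the critical section opaque to the optimizer so
the protected operations cannot be reordered outside the lock/unlock pair *)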
let[@inline never] with_ (self : _ t) f =
Mutex.lock self.mutex;
match f self.content with
| x ->
Mutex.unlock self.mutex;
x
| exception e ->
Mutex.unlock self.mutex;
raise e
let[@inline] update_map l f =
with_ l (fun x ->
let x', y = f x in
l.content <- x';
y)
let get l =
Mutex.lock l.mutex;
let x = l.content in
Mutex.unlock l.mutex;
x
end
type domain = Domain_.t
type event =
| Run of (unit -> unit) (** Run this function *)
| Decr (** Decrease count *)
(* State for a domain worker. It should not do too much except for starting
new threads for pools. *)
type worker_state = {
q: event Bb_queue.t;
th_count: int Atomic.t; (** Number of threads currently running on this domain *)
}
(** Array of (optional) workers.
Workers are started/stopped on demand. For each index we have the (currently
active) domain's state including a work queue and a thread refcount; and the
domain itself, if any, in a separate option because it might outlive its own
state. *)
let domains_ : (worker_state option * Domain_.t option) Lock.t array =
let n = max 1 (Domain_.recommended_number ()) in
Array.init n (fun _ -> Lock.create (None, None))
(** main work loop for a domain worker.
A domain worker does two things:
- run functions it's asked to (mainly, to start new threads inside it)
- decrease the refcount when one of these threads stops. The thread will
notify the domain that it's exiting, so the domain can know how many
threads are still using it. If all threads exit, the domain polls a bit
(in case new threads are created shortly afterwards, which happens with
[Pool.with_] or [Pool.create () … Pool.shutdown ()] in a tight loop), and
if nothing happens it tries to stop to free resources. *)
let work_ idx (st : worker_state) : unit =
Signals_.ignore_signals_ ();
let main_loop () =
let continue = ref true in
while !continue do
match Bb_queue.pop st.q with
| Run f -> (try f () with _ -> ())
| Decr ->
if Atomic.fetch_and_add st.th_count (-1) = 1 then (
continue := false;
(* wait a bit, we might be needed again in a short amount of time *)
try
for _n_attempt = 1 to 50 do
Thread.delay 0.001;
if Atomic.get st.th_count > 0 then (
(* needed again! *)
continue := true;
raise Exit
)
done
with Exit -> ()
)
done
in
while
main_loop ();
(* exit: try to remove ourselves from [domains]. If that fails, keep living. *)
let is_alive =
Lock.update_map domains_.(idx) (function
| None, _ -> assert false
| Some _st', dom ->
assert (st == _st');
if Atomic.get st.th_count > 0 then
(* still alive! *)
(Some st, dom), true
else
(None, dom), false)
in
is_alive
do
()
done;
()
(* special case for main domain: we start a worker immediately *)
let () =
assert (Domain_.is_main_domain ());
let w = { th_count = Atomic.make 1; q = Bb_queue.create () } in
(* thread that stays alive since [th_count>0] will always hold *)
ignore (Thread.create (fun () -> work_ 0 w) () : Thread.t);
domains_.(0) <- Lock.create (Some w, None)
let[@inline] max_number_of_domains () : int = Array.length domains_
let run_on (i : int) (f : unit -> unit) : unit =
assert (i < Array.length domains_);
let w : worker_state =
Lock.update_map domains_.(i) (function
| (Some w, _) as st ->
Atomic.incr w.th_count;
st, w
| None, dying_dom ->
(* join previous dying domain, to free its resources, if any *)
Option.iter Domain_.join dying_dom;
let w = { th_count = Atomic.make 1; q = Bb_queue.create () } in
let worker : domain = Domain_.spawn (fun () -> work_ i w) in
(Some w, Some worker), w)
in
Bb_queue.push w.q (Run f)
let decr_on (i : int) : unit =
assert (i < Array.length domains_);
match Lock.get domains_.(i) with
| Some st, _ -> Bb_queue.push st.q Decr
| None, _ -> ()
let run_on_and_wait (i : int) (f : unit -> 'a) : 'a =
let q = Bb_queue.create () in
run_on i (fun () ->
let x = f () in
Bb_queue.push q x);
Bb_queue.pop q

@ -0,0 +1,35 @@
(** Static pool of domains.
These domains are shared between {b all} the pools in moonpool. The
rationale is that we should not have more domains than cores, so it's easier
to reserve exactly that many domain slots, and run more flexible thread
pools on top (each domain being shared by potentially multiple threads from
multiple pools).
The pool should not contain actual domains if it's not in use, ie if no
runner is currently using any of the domain slots.
{b NOTE}: Interface is still experimental.
@since 0.6 *)
type domain = Domain_.t
val max_number_of_domains : unit -> int
(** Number of domains in the pool when all domains are active. *)
(** {2 Low level interface for resource handling}
Be very cautious with this interface, or resource leaks might occur. *)
val run_on : int -> (unit -> unit) -> unit
(** [run_on i f] runs [f()] on the domain with index [i]. Precondition:
[0 <= i < max_number_of_domains()]. The thread must call {!decr_on} with [i]
once it's done. *)
val decr_on : int -> unit
(** Signal that a thread is stopping on the domain with index [i]. *)
val run_on_and_wait : int -> (unit -> 'a) -> 'a
(** [run_on_and_wait i f] runs [f()] on the domain with index [i], and blocks
until the result of [f()] is returned back. *)

@ -1,18 +0,0 @@
(library
(public_name moonpool)
(name moonpool)
(private_modules d_pool_)
(libraries threads either))
(rule
(targets atomic_.ml)
(action
(with-stdout-to %{targets}
(run ./gen/gen.exe --ocaml %{ocaml_version} --atomic))))
(rule
(targets domain_.ml)
(action
(with-stdout-to %{targets}
(run ./gen/gen.exe --ocaml %{ocaml_version} --domain))))

src/forkjoin/dune
@ -0,0 +1,7 @@
(library
(name moonpool_forkjoin)
(public_name moonpool.forkjoin)
(synopsis "Fork-join parallelism for moonpool")
(flags :standard -open Moonpool)
(optional)
(libraries moonpool moonpool.private picos))

@ -0,0 +1,218 @@
module A = Moonpool.Atomic
module Domain_ = Moonpool_private.Domain_
module State_ = struct
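(* Two-party join protocol: the "left" computation runs in the background,
the "right" one in the caller. Whichever side finishes second moves the
state to [Both_solved]; if the right side finishes first, it parks on a
[Trigger.t] that the left side signals. *)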
type error = exn * Printexc.raw_backtrace
type 'a or_error = ('a, error) result
type ('a, 'b) t =
| Init
| Left_solved of 'a or_error
| Right_solved of 'b or_error * Trigger.t
| Both_solved of 'a or_error * 'b or_error
let get_exn_ (self : _ t A.t) =
match A.get self with
| Both_solved (Ok a, Ok b) -> a, b
| Both_solved (Error (exn, bt), _) | Both_solved (_, Error (exn, bt)) ->
Printexc.raise_with_backtrace exn bt
| _ -> assert false
let rec set_left_ (self : _ t A.t) (left : _ or_error) =
let old_st = A.get self in
match old_st with
| Init ->
let new_st = Left_solved left in
if not (A.compare_and_set self old_st new_st) then (
Domain_.relax ();
set_left_ self left
)
| Right_solved (right, tr) ->
let new_st = Both_solved (left, right) in
if not (A.compare_and_set self old_st new_st) then (
Domain_.relax ();
set_left_ self left
) else
Trigger.signal tr
| Left_solved _ | Both_solved _ -> assert false
let rec set_right_ (self : _ t A.t) (right : _ or_error) : unit =
let old_st = A.get self in
match old_st with
| Left_solved left ->
let new_st = Both_solved (left, right) in
if not (A.compare_and_set self old_st new_st) then set_right_ self right
| Init ->
(* we are first arrived, we suspend until the left computation is done *)
let trigger = Trigger.create () in
let must_await = ref true in
while
let old_st = A.get self in
match old_st with
| Init ->
(* setup trigger so that left computation will wake us up *)
not (A.compare_and_set self old_st (Right_solved (right, trigger)))
| Left_solved left ->
(* other thread is done, no risk of race condition *)
A.set self (Both_solved (left, right));
must_await := false;
false
| Right_solved _ | Both_solved _ -> assert false
do
()
done;
(* wait for the other computation to be done *)
if !must_await then Trigger.await_exn trigger
| Right_solved _ | Both_solved _ -> assert false
end
let both f g : _ * _ =
let module ST = State_ in
let st = A.make ST.Init in
let runner =
match Runner.get_current_runner () with
| None -> invalid_arg "Fork_join.both must be run from within a runner"
| Some r -> r
in
(* start computing [f] in the background *)
Runner.run_async runner (fun () ->
try
let res = f () in
ST.set_left_ st (Ok res)
with exn ->
let bt = Printexc.get_raw_backtrace () in
ST.set_left_ st (Error (exn, bt)));
let res_right =
try Ok (g ())
with exn ->
let bt = Printexc.get_raw_backtrace () in
Error (exn, bt)
in
ST.set_right_ st res_right;
ST.get_exn_ st
let both_ignore f g = ignore (both f g : _ * _)
let for_ ?chunk_size n (f : int -> int -> unit) : unit =
if n > 0 then (
let runner =
match Runner.get_current_runner () with
| None -> failwith "forkjoin.for_: must be run inside a moonpool runner."
| Some r -> r
in
let failure = A.make None in
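(* [missing] counts loop iterations that have not completed yet; the
chunk task that brings it to 0 signals [trigger] *)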
let missing = A.make n in
let chunk_size =
match chunk_size with
| Some cs -> max 1 (min n cs)
| None ->
(* guess: try to have roughly one task per core *)
max 1 (1 + (n / Moonpool.Private.num_domains ()))
in
let trigger = Trigger.create () in
let task_for ~offset ~len_range =
match f offset (offset + len_range - 1) with
| () ->
if A.fetch_and_add missing (-len_range) = len_range then
(* all tasks done successfully *)
Trigger.signal trigger
| exception exn ->
let bt = Printexc.get_raw_backtrace () in
if Option.is_none (A.exchange failure (Some (Exn_bt.make exn bt))) then
(* first one to fail; since we never decrement [missing] for this
chunk, it cannot reach 0, so only we signal [trigger] *)
Trigger.signal trigger
in
let i = ref 0 in
while !i < n do
let offset = !i in
let len_range = min chunk_size (n - offset) in
assert (offset + len_range <= n);
Runner.run_async runner (fun () -> task_for ~offset ~len_range);
i := !i + len_range
done;
Trigger.await_exn trigger;
Option.iter Exn_bt.raise @@ A.get failure;
()
)
let all_array ?chunk_size (fs : _ array) : _ array =
let len = Array.length fs in
let arr = Array.make len None in
(* parallel for *)
for_ ?chunk_size len (fun low high ->
for i = low to high do
let x = fs.(i) () in
arr.(i) <- Some x
done);
(* get all results *)
Array.map
(function
| None -> assert false
| Some x -> x)
arr
let all_list ?chunk_size fs : _ list =
Array.to_list @@ all_array ?chunk_size @@ Array.of_list fs
let all_init ?chunk_size n f : _ list =
let arr = Array.make n None in
for_ ?chunk_size n (fun low high ->
for i = low to high do
let x = f i in
arr.(i) <- Some x
done);
(* get all results *)
List.init n (fun i ->
match arr.(i) with
| None -> assert false
| Some x -> x)
let map_array ?chunk_size f arr : _ array =
let n = Array.length arr in
let res = Array.make n None in
for_ ?chunk_size n (fun low high ->
for i = low to high do
res.(i) <- Some (f arr.(i))
done);
(* get all results *)
Array.map
(function
| None -> assert false
| Some x -> x)
res
let map_list ?chunk_size f (l : _ list) : _ list =
let arr = Array.of_list l in
let n = Array.length arr in
let res = Array.make n None in
for_ ?chunk_size n (fun low high ->
for i = low to high do
res.(i) <- Some (f arr.(i))
done);
(* get all results *)
List.init n (fun i ->
match res.(i) with
| None -> assert false
| Some x -> x)

@ -0,0 +1,113 @@
(** Fork-join primitives.
{b NOTE} These are only available on OCaml 5.0 and above.
@since 0.3 *)
val both : (unit -> 'a) -> (unit -> 'b) -> 'a * 'b
(** [both f g] runs [f()] and [g()], potentially in parallel, and returns their
result when both are done. If any of [f()] and [g()] fails, then the whole
computation fails.
This must be run from within the pool: for example, inside {!Pool.run} or
inside a {!Fut.spawn} computation. This is because it relies on an effect
handler to be installed.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
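(* Example (a hedged sketch): from inside a task already running on a
moonpool runner:
{[
let x, y = both (fun () -> 1 + 1) (fun () -> 2 * 2) in
assert (x = 2 && y = 4)
]} *)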
val both_ignore : (unit -> _) -> (unit -> _) -> unit
(** Same as [both f g |> ignore].
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val for_ : ?chunk_size:int -> int -> (int -> int -> unit) -> unit
(** [for_ n f] is the parallel version of [for i=0 to n-1 do f i done].
[f] is called with parameters [low] and [high] and must use them like so:
{[ for j = low to high do (* … actual work *) done ]}.
If [chunk_size=1] then [low=high] and the loop is not actually needed.
@param chunk_size controls the granularity of parallelism.
The default chunk size is not specified.
See {!all_array} or {!all_list} for more details.
Example:
{[
let total_sum = Atomic.make 0
let () = for_ ~chunk_size:5 100
(fun low high ->
(* iterate on the range sequentially. The range should have 5 items or less. *)
let local_sum = ref 0 in
for j = low to high do
local_sum := !local_sum + j
done;
ignore (Atomic.fetch_and_add total_sum !local_sum : int))
let () = assert (Atomic.get total_sum = 4950)
]}
Note how we still compute a local sum sequentially in [(fun low high -> )],
before combining it wholesale into [total_sum]. When the chunk size is large,
this can have a dramatic impact on the synchronization overhead.
When [chunk_size] is not provided, the library will attempt to guess a value
that keeps all cores busy but runs as few tasks as possible to reduce
the synchronization overhead.
Use [~chunk_size:1] if you explicitly want to
run each iteration of the loop in its own task.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val all_array : ?chunk_size:int -> (unit -> 'a) array -> 'a array
(** [all_array fs] runs all functions in [fs] in tasks, and waits for all the
results.
@param chunk_size
if equal to [n], groups items by [n] to be run in a single task. Default
is [1].
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val all_list : ?chunk_size:int -> (unit -> 'a) list -> 'a list
(** [all_list fs] runs all functions in [fs] in tasks, and waits for all the
results.
@param chunk_size
if equal to [n], groups items by [n] to be run in a single task. Default
is not specified. This parameter is available since 0.3.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val all_init : ?chunk_size:int -> int -> (int -> 'a) -> 'a list
(** [all_init n f] runs functions [f 0], [f 1], … [f (n-1)] in tasks, and waits
for all the results.
@param chunk_size
if equal to [n], groups items by [n] to be run in a single task. Default
is not specified. This parameter is available since 0.3.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val map_array : ?chunk_size:int -> ('a -> 'b) -> 'a array -> 'b array
(** [map_array f arr] is like [Array.map f arr], but runs in parallel.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)
val map_list : ?chunk_size:int -> ('a -> 'b) -> 'a list -> 'b list
(** [map_list f l] is like [List.map f l], but runs in parallel.
@since 0.3
{b NOTE} this is only available on OCaml 5. *)

@ -1,348 +0,0 @@
module A = Atomic_
type 'a or_error = ('a, exn * Printexc.raw_backtrace) result
type 'a waiter = 'a or_error -> unit
type 'a state =
| Done of 'a or_error
| Waiting of 'a waiter list
type 'a t = { st: 'a state A.t } [@@unboxed]
type 'a promise = 'a t
let make () =
let fut = { st = A.make (Waiting []) } in
fut, fut
let of_result x : _ t = { st = A.make (Done x) }
let[@inline] return x : _ t = of_result (Ok x)
let[@inline] fail e bt : _ t = of_result (Error (e, bt))
let[@inline] is_resolved self : bool =
match A.get self.st with
| Done _ -> true
| Waiting _ -> false
let[@inline] peek self : _ option =
match A.get self.st with
| Done x -> Some x
| Waiting _ -> None
let on_result (self : _ t) (f : _ waiter) : unit =
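(* lock-free registration: retry the CAS until we either see a [Done]
result (and call [f] immediately) or atomically cons [f] onto the
waiter list *)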
while
let st = A.get self.st in
match st with
| Done x ->
f x;
false
| Waiting l ->
let must_retry = not (A.compare_and_set self.st st (Waiting (f :: l))) in
must_retry
do
Domain_.relax ()
done
exception Already_fulfilled
let fulfill (self : _ t) (r : _ result) : unit =
while
let st = A.get self.st in
match st with
| Done _ -> raise Already_fulfilled
| Waiting l ->
let did_swap = A.compare_and_set self.st st (Done r) in
if did_swap then (
(* success, now call all the waiters *)
List.iter (fun f -> try f r with _ -> ()) l;
false
) else
true
do
Domain_.relax ()
done
let[@inline] fulfill_idempotent self r =
try fulfill self r with Already_fulfilled -> ()
(* ### combinators ### *)
let spawn ~on f : _ t =
let fut, promise = make () in
let task () =
let res =
try Ok (f ())
with e ->
let bt = Printexc.get_raw_backtrace () in
Error (e, bt)
in
fulfill promise res
in
Pool.run on task;
fut
let map ?on ~f fut : _ t =
let map_res r =
match r with
| Ok x ->
(try Ok (f x)
with e ->
let bt = Printexc.get_raw_backtrace () in
Error (e, bt))
| Error e_bt -> Error e_bt
in
match peek fut with
| Some r -> of_result (map_res r)
| None ->
let fut2, promise = make () in
on_result fut (fun r ->
let map_and_fulfill () =
let res = map_res r in
fulfill promise res
in
match on with
| None -> map_and_fulfill ()
| Some on -> Pool.run on map_and_fulfill);
fut2
let bind ?on ~f fut : _ t =
let apply_f_to_res r : _ t =
match r with
| Ok x ->
(try f x
with e ->
let bt = Printexc.get_raw_backtrace () in
fail e bt)
| Error (e, bt) -> fail e bt
in
match peek fut with
| Some r -> apply_f_to_res r
| None ->
let fut2, promise = make () in
on_result fut (fun r ->
let bind_and_fulfill () =
let f_res_fut = apply_f_to_res r in
(* forward result *)
on_result f_res_fut (fun r -> fulfill promise r)
in
match on with
| None -> bind_and_fulfill ()
| Some on -> Pool.run on bind_and_fulfill);
fut2
let update_ (st : 'a A.t) f : 'a =
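(* generic atomic update: recompute [f x] and retry until the
compare-and-set succeeds; returns the new value *)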
let rec loop () =
let x = A.get st in
let y = f x in
if A.compare_and_set st x y then
y
else (
Domain_.relax ();
loop ()
)
in
loop ()
let both a b : _ t =
match peek a, peek b with
| Some (Ok x), Some (Ok y) -> return (x, y)
| Some (Error (e, bt)), _ | _, Some (Error (e, bt)) -> fail e bt
| _ ->
let fut, promise = make () in
let st = A.make `Neither in
on_result a (function
| Error err -> fulfill_idempotent promise (Error err)
| Ok x ->
(match
update_ st (function
| `Neither -> `Left x
| `Right y -> `Both (x, y)
| _ -> assert false)
with
| `Both (x, y) -> fulfill promise (Ok (x, y))
| _ -> ()));
on_result b (function
| Error err -> fulfill_idempotent promise (Error err)
| Ok y ->
(match
update_ st (function
| `Left x -> `Both (x, y)
| `Neither -> `Right y
| _ -> assert false)
with
| `Both (x, y) -> fulfill promise (Ok (x, y))
| _ -> ()));
fut
let choose a b : _ t =
match peek a, peek b with
| Some (Ok x), _ -> return (Either.Left x)
| _, Some (Ok y) -> return (Either.Right y)
| Some (Error (e, bt)), Some (Error _) -> fail e bt
| _ ->
let fut, promise = make () in
let one_failure = A.make false in
on_result a (function
| Error err ->
if A.exchange one_failure true then
(* the other one failed already *)
fulfill_idempotent promise (Error err)
| Ok x -> fulfill_idempotent promise (Ok (Either.Left x)));
on_result b (function
| Error err ->
if A.exchange one_failure true then
(* the other one failed already *)
fulfill_idempotent promise (Error err)
| Ok y -> fulfill_idempotent promise (Ok (Either.Right y)));
fut
let choose_same a b : _ t =
match peek a, peek b with
| Some (Ok x), _ -> return x
| _, Some (Ok y) -> return y
| Some (Error (e, bt)), Some (Error _) -> fail e bt
| _ ->
let fut, promise = make () in
let one_failure = A.make false in
on_result a (function
| Error err ->
if A.exchange one_failure true then
fulfill_idempotent promise (Error err)
| Ok x -> fulfill_idempotent promise (Ok x));
on_result b (function
| Error err ->
if A.exchange one_failure true then
fulfill_idempotent promise (Error err)
| Ok y -> fulfill_idempotent promise (Ok y));
fut
let peek_ok_assert_ (self : 'a t) : 'a =
match A.get self.st with
| Done (Ok x) -> x
| _ -> assert false
let join_container_ ~iter ~map ~len cont : _ t =
let fut, promise = make () in
let missing = A.make (len cont) in
(* callback called when a future in [a] is resolved *)
let on_res = function
| Ok _ ->
let n = A.fetch_and_add missing (-1) in
if n = 1 then (
(* last future, we know they all succeeded, so resolve [fut] *)
let res = map peek_ok_assert_ cont in
fulfill promise (Ok res)
)
| Error e_bt ->
(* immediately cancel all other [on_res] *)
let n = A.exchange missing 0 in
if n > 0 then
(* we're the only one to set to 0, so we can fulfill [fut]
with an error. *)
fulfill promise (Error e_bt)
in
iter (fun fut -> on_result fut on_res) cont;
fut
let join_array (a : _ t array) : _ array t =
match Array.length a with
| 0 -> return [||]
| 1 -> map ?on:None a.(0) ~f:(fun x -> [| x |])
| _ -> join_container_ ~len:Array.length ~map:Array.map ~iter:Array.iter a
let join_list (l : _ t list) : _ list t =
match l with
| [] -> return []
| [ x ] -> map ?on:None x ~f:(fun x -> [ x ])
| _ -> join_container_ ~len:List.length ~map:List.map ~iter:List.iter l
let wait_array (a : _ t array) : unit t =
join_container_ a ~iter:Array.iter ~len:Array.length ~map:(fun _f _ -> ())
let wait_list (a : _ t list) : unit t =
join_container_ a ~iter:List.iter ~len:List.length ~map:(fun _f _ -> ())
let for_ ~on n f : unit t =
let futs = Array.init n (fun i -> spawn ~on (fun () -> f i)) in
join_container_
~len:(fun () -> n)
~iter:(fun f () -> Array.iter f futs)
~map:(fun _f () -> ())
()
(* ### blocking ### *)
let wait_block (self : 'a t) : 'a or_error =
match A.get self.st with
| Done x -> x (* fast path *)
| Waiting _ ->
let real_block () =
(* use queue only once *)
let q = Bb_queue.create () in
on_result self (fun r -> Bb_queue.push q r);
Bb_queue.pop q
in
(* a bit of spinning before we block *)
let rec loop i =
if i = 0 then
real_block ()
else (
match A.get self.st with
| Done x -> x
| Waiting _ ->
Domain_.relax ();
(loop [@tailcall]) (i - 1)
)
in
loop 50
let wait_block_exn self =
match wait_block self with
| Ok x -> x
| Error (e, bt) -> Printexc.raise_with_backtrace e bt
module type INFIX = sig
val ( >|= ) : 'a t -> ('a -> 'b) -> 'b t
val ( >>= ) : 'a t -> ('a -> 'b t) -> 'b t
val ( let+ ) : 'a t -> ('a -> 'b) -> 'b t
val ( and+ ) : 'a t -> 'b t -> ('a * 'b) t
val ( let* ) : 'a t -> ('a -> 'b t) -> 'b t
val ( and* ) : 'a t -> 'b t -> ('a * 'b) t
end
module Infix_ (X : sig
val pool : Pool.t option
end) : INFIX = struct
let[@inline] ( >|= ) x f = map ?on:X.pool ~f x
let[@inline] ( >>= ) x f = bind ?on:X.pool ~f x
let ( let+ ) = ( >|= )
let ( let* ) = ( >>= )
let ( and+ ) = both
let ( and* ) = both
end
module Infix_local = Infix_ (struct
let pool = None
end)
include Infix_local
module Infix (X : sig
val pool : Pool.t
end) =
Infix_ (struct
let pool = Some X.pool
end)

@ -1,152 +0,0 @@
(** Futures.
A future of type ['a t] represents the result of a computation
that will yield a value of type ['a].
Typically, the computation is running on a thread pool {!Pool.t}
and will proceed on some worker. Once set, a future cannot change.
It either succeeds (storing a [Ok x] with [x: 'a]), or fail
(storing a [Error (exn, bt)] with an exception and the corresponding
backtrace).
Combinators such as {!map} and {!join_array} can be used to produce
futures from other futures (in a monadic way). Some combinators take
a [pool] argument to specify where the intermediate computation takes
place; for example [map ~pool ~f fut] maps the value in [fut]
using function [f], applicatively; the call to [f] happens on
pool [pool] (once [fut] resolves successfully with a value).
*)
type 'a or_error = ('a, exn * Printexc.raw_backtrace) result
type 'a t
(** A future with a result of type ['a]. *)
type 'a promise
(** A promise, which can be fulfilled exactly once to set
the corresponding future *)
val make : unit -> 'a t * 'a promise
(** Make a new future with the associated promise *)
val on_result : 'a t -> ('a or_error -> unit) -> unit
(** [on_result fut f] registers [f] to be called in the future
when [fut] is set ;
or calls [f] immediately if [fut] is already set. *)
exception Already_fulfilled
val fulfill : 'a promise -> 'a or_error -> unit
(** Fulfill the promise, setting the future at the same time.
@raise Already_fulfilled if the promise is already fulfilled. *)
val fulfill_idempotent : 'a promise -> 'a or_error -> unit
(** Fulfill the promise, setting the future at the same time.
Does nothing if the promise is already fulfilled. *)
val return : 'a -> 'a t
(** Already settled future, with a result *)
val fail : exn -> Printexc.raw_backtrace -> _ t
(** Already settled future, with a failure *)
val of_result : 'a or_error -> 'a t
val is_resolved : _ t -> bool
(** [is_resolved fut] is [true] iff [fut] is resolved. *)
val peek : 'a t -> 'a or_error option
(** [peek fut] returns [Some r] if [fut] is currently resolved with [r],
and [None] if [fut] is not resolved yet. *)
(** {2 Combinators} *)
val spawn : on:Pool.t -> (unit -> 'a) -> 'a t
(** [spawn ~on f] runs [f()] on the given pool, and returns a future that
will hold its result. *)
val map : ?on:Pool.t -> f:('a -> 'b) -> 'a t -> 'b t
(** [map ?on ~f fut] returns a new future [fut2] that resolves
with [f x] if [fut] resolved with [x];
and fails with [e] if [fut] fails with [e] or [f x] raises [e].
@param on if provided, [f] runs on the given pool *)
val bind : ?on:Pool.t -> f:('a -> 'b t) -> 'a t -> 'b t
(** [bind ?on ~f fut] returns a new future [fut2] that resolves
like the future [f x] if [fut] resolved with [x];
and fails with [e] if [fut] fails with [e] or [f x] raises [e].
@param on if provided, [f] runs on the given pool *)
val both : 'a t -> 'b t -> ('a * 'b) t
(** [both a b] succeeds with [x, y] if [a] succeeds with [x] and
[b] succeeds with [y], or fails if any of them fails. *)
val choose : 'a t -> 'b t -> ('a, 'b) Either.t t
(** [choose a b] succeeds with [Left x] or [Right y] if [a] succeeds with [x]
or [b] succeeds with [y], or fails if both of them fail.
If they both succeed, it is not specified which result is used. *)
val choose_same : 'a t -> 'a t -> 'a t
(** [choose_same a b] succeeds with the value of one of [a] or [b] if
they succeed, or fails if both fail.
If they both succeed, it is not specified which result is used. *)
val join_array : 'a t array -> 'a array t
(** Wait for all the futures in the array. Fails if any future fails. *)
val join_list : 'a t list -> 'a list t
(** Wait for all the futures in the list. Fails if any future fails. *)
val wait_array : _ t array -> unit t
(** [wait_array arr] waits for all futures in [arr] to resolve. It discards
the individual results of futures in [arr]. It fails if any future fails. *)
val wait_list : _ t list -> unit t
(** [wait_list l] waits for all futures in [l] to resolve. It discards
the individual results of futures in [l]. It fails if any future fails. *)
val for_ : on:Pool.t -> int -> (int -> unit) -> unit t
(** [for_ ~on n f] runs [f 0], [f 1], …, [f (n-1)] on the pool, and returns
a future that resolves when all the tasks have resolved, or fails
as soon as one task has failed. *)
(** {2 Blocking} *)
val wait_block : 'a t -> 'a or_error
(** [wait_block fut] blocks the current thread until [fut] is resolved,
and returns its value.
{b NOTE}: this will monopolize the calling thread until the future
resolves. It can also easily cause deadlocks, if enough threads in a pool
call [wait_block] on futures running on the same pool or a pool depending on it.
A good rule to avoid deadlocks is to run this from outside of any pool,
or to have an acyclic order between pools where [wait_block]
is only called from a pool on futures evaluated in a pool that comes lower
in the hierarchy.
If this rule is broken, it is possible for all threads in a pool to wait
for futures that can only make progress on these same threads,
hence the deadlock.
*)
val wait_block_exn : 'a t -> 'a
(** Same as {!wait_block} but re-raises the exception if the future failed. *)
module type INFIX = sig
val ( >|= ) : 'a t -> ('a -> 'b) -> 'b t
val ( >>= ) : 'a t -> ('a -> 'b t) -> 'b t
val ( let+ ) : 'a t -> ('a -> 'b) -> 'b t
val ( and+ ) : 'a t -> 'b t -> ('a * 'b) t
val ( let* ) : 'a t -> ('a -> 'b t) -> 'b t
val ( and* ) : 'a t -> 'b t -> ('a * 'b) t
end
module Infix_local : INFIX
(** Operators that run on the same thread as the first future. *)
include INFIX
(** Make infix combinators *)
module Infix (_ : sig
val pool : Pool.t
end) : INFIX

@ -1,3 +0,0 @@
(executable
(name gen))

@ -1,107 +0,0 @@
let atomic_pre_412 =
{|
type 'a t = { mutable x: 'a }
let[@inline] make x = { x }
let[@inline] get { x } = x
let[@inline] set r x = r.x <- x
let[@inline never] exchange r x =
(* atomic *)
let y = r.x in
r.x <- x;
(* atomic *)
y
let[@inline never] compare_and_set r seen v =
(* atomic *)
if r.x == seen then (
r.x <- v;
(* atomic *)
true
) else
false
let[@inline never] fetch_and_add r x =
(* atomic *)
let v = r.x in
r.x <- x + r.x;
(* atomic *)
v
let[@inline never] incr r =
(* atomic *)
r.x <- 1 + r.x
(* atomic *)
let[@inline never] decr r =
(* atomic *)
r.x <- r.x - 1
(* atomic *)
|}
let atomic_post_412 = {|
include Atomic
|}
let domain_pre_5 =
{|
let recommended_number () = 1
type t = Thread.t
let get_id (self:t) : int = Thread.id self
let spawn f : t =
Thread.create f ()
let relax () = Thread.yield ()
|}
let domain_post_5 =
{|
let recommended_number () = Domain.recommended_domain_count ()
type t = unit Domain.t
let get_id (self:t) : int = (Domain.get_id self :> int)
let spawn : _ -> t = Domain.spawn
let relax = Domain.cpu_relax
|}
let p_version s = Scanf.sscanf s "%d.%d" (fun x y -> x, y)
let () =
let atomic = ref false in
let domain = ref false in
let ocaml = ref Sys.ocaml_version in
Arg.parse
[
"--atomic", Arg.Set atomic, " atomic";
"--domain", Arg.Set domain, " domain";
"--ocaml", Arg.Set_string ocaml, " set ocaml version";
]
ignore "";
let major, minor = p_version !ocaml in
if !atomic then (
let code =
if (major, minor) < (4, 12) then
atomic_pre_412
else
atomic_post_412
in
print_endline code
) else if !domain then (
let code =
if (major, minor) < (5, 0) then
domain_pre_5
else
domain_post_5
in
print_endline code
)

src/lwt/dune
@ -0,0 +1,10 @@
(library
(name moonpool_lwt)
(public_name moonpool-lwt)
(enabled_if
(>= %{ocaml_version} 5.0))
(libraries
(re_export moonpool)
picos
(re_export lwt)
lwt.unix))

src/lwt/moonpool_lwt.ml
@ -0,0 +1,310 @@
module Exn_bt = Moonpool.Exn_bt
open struct
module WL = Moonpool.Private.Worker_loop_
module M = Moonpool
end
module Fut = Moonpool.Fut
let on_uncaught_exn : (Moonpool.Exn_bt.t -> unit) ref =
ref (fun ebt ->
Printf.eprintf "uncaught exception in moonpool-lwt:\n%s" (Exn_bt.show ebt))
module Scheduler_state = struct
type st = {
tasks: WL.task_full Queue.t;
actions_from_other_threads: (unit -> unit) Queue.t;
(** Other threads ask us to run closures in the lwt thread *)
mutex: Mutex.t;
mutable thread: int;
closed: bool Atomic.t;
cleanup_done: bool Atomic.t;
mutable as_runner: Moonpool.Runner.t;
mutable enter_hook: Lwt_main.Enter_iter_hooks.hook option;
mutable leave_hook: Lwt_main.Leave_iter_hooks.hook option;
mutable notification: int;
(** A lwt_unix notification to wake up the event loop *)
has_notified: bool Atomic.t;
}
(** Main state *)
let cur_st : st option Atomic.t = Atomic.make None
let create_new () : st =
{
tasks = Queue.create ();
actions_from_other_threads = Queue.create ();
mutex = Mutex.create ();
thread = Thread.id (Thread.self ());
closed = Atomic.make false;
cleanup_done = Atomic.make false;
as_runner = Moonpool.Runner.dummy;
enter_hook = None;
leave_hook = None;
notification = 0;
has_notified = Atomic.make false;
}
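(* wake the Lwt event loop at most once per drain cycle: [has_notified]
is reset by the hook after it transfers the pending actions *)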
let[@inline] notify_ (self : st) : unit =
if not (Atomic.exchange self.has_notified true) then
Lwt_unix.send_notification self.notification
let[@inline never] add_action_from_another_thread_ (self : st) f : unit =
Mutex.lock self.mutex;
Queue.push f self.actions_from_other_threads;
Mutex.unlock self.mutex;
notify_ self
let[@inline] on_lwt_thread_ (self : st) : bool =
Thread.id (Thread.self ()) = self.thread
let[@inline] run_on_lwt_thread_ (self : st) (f : unit -> unit) : unit =
if on_lwt_thread_ self then
f ()
else
add_action_from_another_thread_ self f
let cleanup (st : st) =
match Atomic.get cur_st with
| Some st' ->
if st != st' then
failwith
"moonpool-lwt: cleanup failed (state is not the currently active \
one!)";
if not (on_lwt_thread_ st) then
failwith "moonpool-lwt: cleanup from the wrong thread";
Atomic.set st.closed true;
if not (Atomic.exchange st.cleanup_done true) then (
Option.iter Lwt_main.Enter_iter_hooks.remove st.enter_hook;
Option.iter Lwt_main.Leave_iter_hooks.remove st.leave_hook;
Lwt_unix.stop_notification st.notification
);
Atomic.set cur_st None
| None -> failwith "moonpool-lwt: cleanup failed (no current active state)"
end
module Ops = struct
type st = Scheduler_state.st
let schedule (self : st) t =
if Atomic.get self.closed then
failwith "moonpool-lwt.schedule: scheduler is closed";
Scheduler_state.run_on_lwt_thread_ self (fun () -> Queue.push t self.tasks)
let get_next_task (self : st) =
if Atomic.get self.closed then raise WL.No_more_tasks;
try Queue.pop self.tasks with Queue.Empty -> raise WL.No_more_tasks
let on_exn _ ebt = !on_uncaught_exn ebt
let runner (self : st) = self.as_runner
let cleanup = Scheduler_state.cleanup
let as_runner (self : st) : Moonpool.Runner.t =
Moonpool.Runner.For_runner_implementors.create
~size:(fun () -> 1)
~num_tasks:(fun () ->
Mutex.lock self.mutex;
let n = Queue.length self.tasks in
Mutex.unlock self.mutex;
n)
~run_async:(fun ~fiber f -> schedule self @@ WL.T_start { fiber; f })
~shutdown:(fun ~wait:_ () -> Atomic.set self.closed true)
()
let before_start (self : st) : unit =
self.as_runner <- as_runner self;
()
let ops : st WL.ops =
{ schedule; get_next_task; on_exn; runner; before_start; cleanup }
let setup st =
if Atomic.compare_and_set Scheduler_state.cur_st None (Some st) then
before_start st
else
failwith "moonpool-lwt: setup failed (state already in place)"
end
(** Resolve [prom] with the result of [lwt_fut] *)
let transfer_lwt_to_fut (lwt_fut : 'a Lwt.t) (prom : 'a Fut.promise) : unit =
Lwt.on_any lwt_fut
(fun x -> M.Fut.fulfill_idempotent prom (Ok x))
(fun exn ->
let bt = Printexc.get_callstack 10 in
M.Fut.fulfill_idempotent prom (Error (Exn_bt.make exn bt)))
let[@inline] register_trigger_on_lwt_termination (lwt_fut : _ Lwt.t)
(tr : M.Trigger.t) : unit =
Lwt.on_termination lwt_fut (fun _ -> M.Trigger.signal tr)
let[@inline] await_lwt_terminated (fut : _ Lwt.t) =
match Lwt.state fut with
| Return x -> x
| Fail exn -> raise exn
| Sleep -> assert false
module Main_state = struct
let[@inline] get_st () : Scheduler_state.st =
match Atomic.get Scheduler_state.cur_st with
| Some st ->
if Atomic.get st.closed then failwith "moonpool-lwt: scheduler is closed";
st
| None -> failwith "moonpool-lwt: scheduler is not setup"
let[@inline] run_on_lwt_thread f =
Scheduler_state.run_on_lwt_thread_ (get_st ()) f
let[@inline] on_lwt_thread () : bool =
Scheduler_state.on_lwt_thread_ (get_st ())
let[@inline] add_action_from_another_thread f : unit =
Scheduler_state.add_action_from_another_thread_ (get_st ()) f
end
let await_lwt_from_another_thread fut =
let tr = M.Trigger.create () in
Main_state.add_action_from_another_thread (fun () ->
register_trigger_on_lwt_termination fut tr);
M.Trigger.await_exn tr;
await_lwt_terminated fut
let await_lwt (fut : _ Lwt.t) =
if Scheduler_state.on_lwt_thread_ (Main_state.get_st ()) then (
(* can directly access the future *)
match Lwt.state fut with
| Return x -> x
| Fail exn -> raise exn
| Sleep ->
let tr = M.Trigger.create () in
register_trigger_on_lwt_termination fut tr;
M.Trigger.await_exn tr;
await_lwt_terminated fut
) else
await_lwt_from_another_thread fut
let lwt_of_fut (fut : 'a M.Fut.t) : 'a Lwt.t =
if not (Main_state.on_lwt_thread ()) then
failwith "lwt_of_fut: not on the lwt thread";
let lwt_fut, lwt_prom = Lwt.wait () in
(* in lwt thread, resolve [lwt_fut] *)
let wakeup_using_res = function
| Ok x -> Lwt.wakeup lwt_prom x
| Error ebt ->
let exn = Exn_bt.exn ebt in
Lwt.wakeup_exn lwt_prom exn
in
M.Fut.on_result fut (fun res ->
Main_state.run_on_lwt_thread (fun () ->
(* can safely wakeup from the lwt thread *)
wakeup_using_res res));
lwt_fut
let fut_of_lwt (lwt_fut : _ Lwt.t) : _ M.Fut.t =
if Main_state.on_lwt_thread () then (
match Lwt.state lwt_fut with
| Return x -> M.Fut.return x
| _ ->
let fut, prom = M.Fut.make () in
transfer_lwt_to_fut lwt_fut prom;
fut
) else (
let fut, prom = M.Fut.make () in
Main_state.add_action_from_another_thread (fun () ->
transfer_lwt_to_fut lwt_fut prom);
fut
)
module Setup_lwt_hooks (ARG : sig
val st : Scheduler_state.st
end) =
struct
open ARG
module FG =
WL.Fine_grained
(struct
include Scheduler_state
let st = st
let ops = Ops.ops
end)
()
let run_in_hook () =
(* execute actions sent from other threads; first transfer them
all atomically to a local queue to reduce contention *)
let local_acts = Queue.create () in
Mutex.lock st.mutex;
Queue.transfer st.actions_from_other_threads local_acts;
Atomic.set st.has_notified false;
Mutex.unlock st.mutex;
Queue.iter (fun f -> f ()) local_acts;
(* run tasks *)
FG.run ~max_tasks:1000 ();
if not (Queue.is_empty st.tasks) then ignore (Lwt.pause () : unit Lwt.t);
()
let setup () =
(* only one thread does this *)
FG.setup ~block_signals:false ();
st.thread <- Thread.self () |> Thread.id;
st.enter_hook <- Some (Lwt_main.Enter_iter_hooks.add_last run_in_hook);
st.leave_hook <- Some (Lwt_main.Leave_iter_hooks.add_last run_in_hook);
(* notification used to wake lwt up *)
st.notification <- Lwt_unix.make_notification ~once:false run_in_hook
end
let setup () : Scheduler_state.st =
let st = Scheduler_state.create_new () in
Ops.setup st;
let module Setup_lwt_hooks' = Setup_lwt_hooks (struct
let st = st
end) in
Setup_lwt_hooks'.setup ();
st
let[@inline] is_setup () = Option.is_some @@ Atomic.get Scheduler_state.cur_st
let spawn_lwt f : _ Lwt.t =
let st = Main_state.get_st () in
let lwt_fut, lwt_prom = Lwt.wait () in
M.run_async st.as_runner (fun () ->
try
let x = f () in
Lwt.wakeup lwt_prom x
with exn -> Lwt.wakeup_exn lwt_prom exn);
lwt_fut
let spawn_lwt_ignore f = ignore (spawn_lwt f : unit Lwt.t)
let on_lwt_thread = Main_state.on_lwt_thread
let run_in_lwt_and_await (f : unit -> 'a) : 'a =
let st = Main_state.get_st () in
if Scheduler_state.on_lwt_thread_ st then
(* run immediately *)
f ()
else
await_lwt_from_another_thread @@ spawn_lwt f
let lwt_main (f : _ -> 'a) : 'a =
let st = setup () in
(* make sure to cleanup *)
let finally () = Scheduler_state.cleanup st in
Fun.protect ~finally @@ fun () ->
let fut = spawn_lwt (fun () -> f st.as_runner) in
(* make sure the scheduler isn't already sleeping *)
Scheduler_state.notify_ st;
Lwt_main.run fut
let[@inline] lwt_main_runner () =
let st = Main_state.get_st () in
st.as_runner

src/lwt/moonpool_lwt.mli
@ -0,0 +1,73 @@
(** Lwt_engine-based event loop for Moonpool.
In what follows, we mean by "lwt thread" the thread running {!lwt_main}
(which wraps [Lwt_main.run]; so, the thread where the Lwt event loop and all
Lwt callbacks execute).
{b NOTE}: this is experimental and might change in future versions.
@since 0.6
The API has entirely changed since 0.9, see
https://github.com/c-cube/moonpool/pull/37 *)
module Fut = Moonpool.Fut
(** {2 Basic conversions} *)
val fut_of_lwt : 'a Lwt.t -> 'a Moonpool.Fut.t
(** [fut_of_lwt lwt_fut] makes a thread-safe moonpool future that completes when
[lwt_fut] does. This can be run from any thread. *)
val lwt_of_fut : 'a Moonpool.Fut.t -> 'a Lwt.t
(** [lwt_of_fut fut] makes a lwt future that completes when [fut] does. This
must be called from the Lwt thread, and the result must always be used only
from inside the Lwt thread.
@raise Failure if not run from the lwt thread. *)
(** {2 Helpers on the moonpool side} *)
val spawn_lwt : (unit -> 'a) -> 'a Lwt.t
(** This spawns a task that runs in the Lwt scheduler. This function is thread
safe.
@raise Failure if {!lwt_main} was not called. *)
val spawn_lwt_ignore : (unit -> unit) -> unit
(** Like {!spawn_lwt} but ignores the result, like [Lwt.async]. This function is
thread safe. *)
val await_lwt : 'a Lwt.t -> 'a
(** [await_lwt fut] awaits a Lwt future from inside a task running on a moonpool
runner. This must be run from within a Moonpool runner so that the await-ing
effect is handled, but it doesn't have to run from inside the Lwt thread. *)
val run_in_lwt_and_await : (unit -> 'a) -> 'a
(** [run_in_lwt_and_await f] runs [f()] in the lwt thread, just like
[spawn_lwt f], and then calls {!await_lwt} on the result. This means [f()]
can use Lwt functions and libraries, use {!await_lwt} on them freely, etc.
This function must run from within a task running on a moonpool runner so
that it can [await_lwt]. *)
(** {2 Wrappers around Lwt_main} *)
val on_uncaught_exn : (Moonpool.Exn_bt.t -> unit) ref
(** Exception handler for tasks that raise an uncaught exception. *)
val lwt_main : (Moonpool.Runner.t -> 'a) -> 'a
(** [lwt_main f] sets the moonpool-lwt bridge up, runs lwt main, calls [f],
destroys the bridge, and returns the result of [f()]. Only one thread should
call this at a time. *)
val on_lwt_thread : unit -> bool
(** [on_lwt_thread ()] is true if the current thread is the one currently
running {!lwt_main}. This is thread safe.
@raise Failure if {!lwt_main} was not called. *)
val lwt_main_runner : unit -> Moonpool.Runner.t
(** The runner from {!lwt_main}. The runner is only going to work if {!lwt_main}
is currently running in some thread. This is thread safe.
@raise Failure if {!lwt_main} was not called. *)
val is_setup : unit -> bool
(** Is the moonpool-lwt bridge setup? This is thread safe. *)
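(* Example (a hedged sketch; assumes [Lwt_unix.sleep] from lwt.unix):
{[
let () =
Moonpool_lwt.lwt_main @@ fun _runner ->
(* this callback runs as a task on the lwt-backed runner, so
[await_lwt] is legal here *)
Moonpool_lwt.await_lwt (Lwt_unix.sleep 0.1);
print_endline "slept via lwt, awaited from moonpool"
]} *)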

@ -1,8 +0,0 @@
let start_thread_on_some_domain f x =
let did = Random.int (D_pool_.n_domains ()) in
D_pool_.run_on_and_wait did (fun () -> Thread.create f x)
module Pool = Pool
module Fut = Fut
module Blocking_queue = Bb_queue
module Atomic = Atomic_

@ -1,70 +0,0 @@
(** Moonpool
A pool within a bigger pool (ie the ocean). Here, we're talking about
pools of [Thread.t] which live within a fixed pool of [Domain.t].
*)
module Pool = Pool
val start_thread_on_some_domain : ('a -> unit) -> 'a -> Thread.t
(** Similar to {!Thread.create}, but it picks a background domain at random
to run the thread. This ensures that we don't always pick the same domain
to run all the various threads needed in an application (timers, event loops, etc.) *)
module Fut = Fut
(** A simple blocking queue.
This queue is quite basic and will not behave well under heavy
contention. However, it can be sufficient for many practical use cases.
{b NOTE}: this queue will typically block the caller thread
in case the operation (push/pop) cannot proceed.
Be wary of deadlocks when using the queue {i from} a pool
when you expect the other end to also be produced/consumed from
the same pool.
See discussion on {!Fut.wait_block} for more details on deadlocks
and how to mitigate the risk of running into them.
More scalable queues can be found in
Lockfree (https://github.com/ocaml-multicore/lockfree/)
*)
module Blocking_queue : sig
type 'a t
(** Unbounded blocking queue.
This queue is thread-safe and will block when calling {!pop}
on it when it's empty. *)
val create : unit -> _ t
(** Create a new unbounded queue. *)
exception Closed
val push : 'a t -> 'a -> unit
(** [push q x] pushes [x] into [q], and returns [()].
In the current implementation, [push q] will never block for
a long time, it will only block while waiting for a lock
so it can push the element.
@raise Closed if the queue is closed (by a previous call to [close q]) *)
val pop : 'a t -> 'a
(** [pop q] pops the next element in [q]. It might block until an element comes.
@raise Closed if the queue was closed before a new element was available. *)
val close : _ t -> unit
(** Close the queue, meaning there won't be any more [push] allowed,
ie [push] will raise {!Closed}.
[pop] will keep working and will return the elements present in the
queue, until it's entirely drained; then [pop] will
also raise {!Closed}. *)
end
module Atomic = Atomic_
(** Atomic values.
This is either a shim using [ref], on pre-OCaml 5, or the
standard [Atomic] module on OCaml 5. *)

@ -1,123 +0,0 @@
(* TODO: use a better queue for the tasks *)
module A = Atomic_
type t = {
active: bool A.t;
threads: Thread.t array;
q: (unit -> unit) Bb_queue.t;
}
type thread_loop_wrapper =
thread:Thread.t -> pool:t -> (unit -> unit) -> unit -> unit
let global_thread_wrappers_ : thread_loop_wrapper list A.t = A.make []
let add_global_thread_loop_wrapper f : unit =
while
let l = A.get global_thread_wrappers_ in
not (A.compare_and_set global_thread_wrappers_ l (f :: l))
do
Domain_.relax ()
done
exception Shutdown
let[@inline] run self f : unit =
try Bb_queue.push self.q f with Bb_queue.Closed -> raise Shutdown
let size self = Array.length self.threads
let worker_thread_ ~on_exn (active : bool A.t) (q : _ Bb_queue.t) : unit =
while A.get active do
match Bb_queue.pop q with
| exception Bb_queue.Closed -> ()
| task ->
(try task ()
with e ->
let bt = Printexc.get_raw_backtrace () in
on_exn e bt)
done
let default_thread_init_exit_ ~dom_id:_ ~t_id:_ () = ()
let create ?(on_init_thread = default_thread_init_exit_)
?(on_exit_thread = default_thread_init_exit_) ?(thread_wrappers = [])
?(on_exn = fun _ _ -> ()) ?(min = 1) ?(per_domain = 0) () : t =
(* number of threads to run *)
let min = max 1 min in
let n_domains = D_pool_.n_domains () in
assert (n_domains >= 1);
let n = max min (n_domains * per_domain) in
(* make sure we don't bias towards the first domain(s) in {!D_pool_} *)
let offset = Random.int n_domains in
let active = A.make true in
let q = Bb_queue.create () in
let pool =
let dummy = Thread.self () in
{ active; threads = Array.make n dummy; q }
in
(* temporary queue used to obtain thread handles from domains
on which the thread are started. *)
let receive_threads = Bb_queue.create () in
(* start the thread with index [i] *)
let start_thread_with_idx i =
let dom_idx = (offset + i) mod n_domains in
(* function run in the thread itself *)
let main_thread_fun () =
let thread = Thread.self () in
let t_id = Thread.id thread in
on_init_thread ~dom_id:dom_idx ~t_id ();
let all_wrappers =
List.rev_append thread_wrappers (A.get global_thread_wrappers_)
in
let run () = worker_thread_ ~on_exn active q in
(* the actual worker loop is [worker_thread_], with all
wrappers for this pool and for all pools (global_thread_wrappers_) *)
let run' =
List.fold_left (fun run f -> f ~thread ~pool run) run all_wrappers
in
(* now run the main loop *)
run' ();
on_exit_thread ~dom_id:dom_idx ~t_id ()
in
(* function called in domain with index [i], to
create the thread and push it into [receive_threads] *)
let create_thread_in_domain () =
let thread = Thread.create main_thread_fun () in
(* send the thread from the domain back to us *)
Bb_queue.push receive_threads (i, thread)
in
D_pool_.run_on dom_idx create_thread_in_domain
in
(* start all threads, placing them on the domains
according to their index and [offset] in a round-robin fashion. *)
for i = 0 to n - 1 do
start_thread_with_idx i
done;
(* receive the newly created threads back from domains *)
for _j = 1 to n do
let i, th = Bb_queue.pop receive_threads in
pool.threads.(i) <- th
done;
pool
let shutdown (self : t) : unit =
let was_active = A.exchange self.active false in
(* close the job queue, which will fail future calls to [run],
and wake up the subset of [self.threads] that are waiting on it. *)
if was_active then Bb_queue.close self.q;
Array.iter Thread.join self.threads

@ -1,56 +0,0 @@
(** Thread pool. *)
type t
(** A pool of threads. The pool contains a fixed number of threads that
wait for work items to come, process these, and loop.
If a pool is no longer needed, {!shutdown} can be used to signal all threads
in it to stop (after they finish their work), and wait for them to stop.
The threads are distributed across a fixed domain pool
(whose size is determined by {!Domain.recommended_domain_count} on OCaml 5, and
simply the single runtime on OCaml 4). *)
type thread_loop_wrapper =
thread:Thread.t -> pool:t -> (unit -> unit) -> unit -> unit
(** a thread wrapper [f] takes the current thread, the current pool,
and the worker function [loop : unit -> unit] which is
the worker's main loop, and returns a new loop function.
By default it just returns the same loop function but it can be used
to install tracing, effect handlers, etc. *)
val add_global_thread_loop_wrapper : thread_loop_wrapper -> unit
(** [add_global_thread_loop_wrapper f] installs [f] in every new pool worker
thread, for all existing pools and all new pools created with [create].
These wrappers accumulate: they all apply, but their order is not specified. *)
val create :
?on_init_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?on_exit_thread:(dom_id:int -> t_id:int -> unit -> unit) ->
?thread_wrappers:thread_loop_wrapper list ->
?on_exn:(exn -> Printexc.raw_backtrace -> unit) ->
?min:int ->
?per_domain:int ->
unit ->
t
(** [create ()] makes a new thread pool.
@param on_init_thread called at the beginning of each new thread
in the pool.
@param on_exit_thread called at the end of each thread in the pool
@param thread_wrappers a list of {!thread_loop_wrapper} functions
to use for this pool's workers.
*)
val size : t -> int
(** Number of threads in the pool. *)
val shutdown : t -> unit
(** Shutdown the pool and wait for it to terminate. Idempotent. *)
exception Shutdown
val run : t -> (unit -> unit) -> unit
(** [run pool f] schedules [f] for later execution on the pool
in one of the threads. [f()] will run on one of the pool's
worker threads.
@raise Shutdown if the pool was shut down before [run] was called. *)
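
For reference, a minimal usage sketch of this (now removed) interface; the module name [Pool] and the wrapper body are assumptions:
let () =
  (* wrap every worker's main loop, e.g. to install logging or handlers *)
  Pool.add_global_thread_loop_wrapper (fun ~thread:_ ~pool:_ loop () ->
      print_endline "worker: loop starting";
      loop ();
      print_endline "worker: loop finished");
  let pool = Pool.create ~min:4 () in
  Pool.run pool (fun () -> print_endline "hello from a worker");
  Pool.shutdown pool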

src/private/domain_.ml Normal file

@@ -0,0 +1,11 @@
[@@@ocaml.alert "-unstable"]
let recommended_number () = Domain.recommended_domain_count ()
type t = unit Domain.t
let get_id (self : t) : int = (Domain.get_id self :> int)
let spawn : _ -> t = Domain.spawn
let relax = Domain.cpu_relax
let join = Domain.join
let is_main_domain = Domain.is_main_domain
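
A quick sketch of this wrapper in use (this assumes OCaml 5, where [Domain.spawn] exists):
let () =
  let d = Domain_.spawn (fun () -> print_endline "on another domain") in
  Printf.printf "domain id: %d (main: %b)\n" (Domain_.get_id d)
    (Domain_.is_main_domain ());
  Domain_.join d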

src/private/dune Normal file

@@ -0,0 +1,12 @@
(library
(name moonpool_private)
(public_name moonpool.private)
(synopsis "Private internal utils for Moonpool (do not rely on)")
(libraries
threads
either
(select
tracing_.ml
from
(trace.core -> tracing_.real.ml)
(-> tracing_.dummy.ml))))
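
The [select] stanza is what makes tracing optional: when the [trace.core] library is installed, [tracing_.ml] is built from [tracing_.real.ml]; otherwise the no-op [tracing_.dummy.ml] is used, so the rest of the code can call [Tracing_] unconditionally.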

src/private/signals_.ml Normal file

@@ -0,0 +1,15 @@
(* Block a standard set of signals in the calling thread, so that they
are delivered to the main thread rather than to a background worker.
Best-effort: [Thread.sigmask] may be unsupported on some platforms,
hence the catch-all. *)
let ignore_signals_ () =
try
Thread.sigmask Unix.SIG_BLOCK
[
Sys.sigpipe;
Sys.sigbus;
Sys.sigterm;
Sys.sigchld;
Sys.sigalrm;
Sys.sigint;
Sys.sigusr1;
Sys.sigusr2;
]
|> ignore
with _ -> ()
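
A sketch of the intended call site ([spawn_worker] is a hypothetical helper; the point is to block the signals first thing in every background thread, so process-level signals keep going to the main thread):
let spawn_worker (work : unit -> unit) : Thread.t =
  Thread.create
    (fun () ->
      (* must run before any other work in this thread *)
      ignore_signals_ ();
      work ())
    ()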

src/private/tracing_.dummy.ml Normal file

@@ -0,0 +1,6 @@
let enabled () = false
let dummy_span = 0L
let enter_span _name = dummy_span
let exit_span = ignore
let set_thread_name = ignore
let with_span _ f = f dummy_span

src/private/tracing_.mli Normal file

@@ -0,0 +1,6 @@
val dummy_span : int64
val enter_span : string -> int64
val exit_span : int64 -> unit
val with_span : string -> (int64 -> 'a) -> 'a
val enabled : unit -> bool
val set_thread_name : string -> unit

src/private/tracing_.real.ml Normal file

@@ -0,0 +1,25 @@
module Trace = Trace_core
let enabled = Trace.enabled
let dummy_span = Int64.min_int
let dummy_file_ = "<unknown file>"
let set_thread_name = Trace.set_thread_name
let[@inline] enter_span name : int64 =
if name = "" then
dummy_span
else
Trace.enter_span ~__FILE__:dummy_file_ ~__LINE__:0 name
let[@inline] exit_span sp = if sp <> dummy_span then Trace.exit_span sp
let with_span name f =
let sp = enter_span name in
try
let x = f sp in
exit_span sp;
x
with exn ->
let bt = Printexc.get_raw_backtrace () in
exit_span sp;
Printexc.raise_with_backtrace exn bt
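
A small usage sketch ([compute] is illustrative; the module name [Tracing_] comes from the dune [select] target). Note how the span is closed on both the normal and the exceptional path:
let compute () =
  Tracing_.with_span "compute" (fun _sp ->
      (* traced work goes here *)
      1 + 1)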

src/private/ws_deque_.ml Normal file

@@ -0,0 +1,129 @@
module A = Atomic
(* terminology:
- bottom: where we push/pop normally; only the owner thread does that.
- top: where work stealing happens (older values); [top] only ever grows.
Elements are always added at the bottom end. *)
(** Circular array (size is [2 ^ log_size]) *)
module CA : sig
type 'a t
val create : dummy:'a -> unit -> 'a t
val size : 'a t -> int
val get : 'a t -> int -> 'a
val set : 'a t -> int -> 'a -> unit
end = struct
(** The array has size 256. *)
let log_size = 8
type 'a t = { arr: 'a array } [@@unboxed]
let[@inline] size (_self : _ t) = 1 lsl log_size
let create ~dummy () : _ t = { arr = Array.make (1 lsl log_size) dummy }
let[@inline] get (self : 'a t) (i : int) : 'a =
Array.unsafe_get self.arr (i land ((1 lsl log_size) - 1))
let[@inline] set (self : 'a t) (i : int) (x : 'a) : unit =
Array.unsafe_set self.arr (i land ((1 lsl log_size) - 1)) x
end
type 'a t = {
top: int A.t; (** Where we steal *)
bottom: int A.t; (** Where we push/pop from the owning thread *)
mutable top_cached: int; (** Last read value of [top] *)
arr: 'a CA.t; (** The circular array *)
}
let create ~dummy () : _ t =
let top = A.make 0 in
let arr = CA.create ~dummy () in
(* allocate far from [top] to avoid false sharing *)
let bottom = A.make 0 in
{ top; top_cached = 0; bottom; arr }
let[@inline] size (self : _ t) : int = max 0 (A.get self.bottom - A.get self.top)
exception Full
let push (self : 'a t) (x : 'a) : bool =
try
let b = A.get self.bottom in
let t_approx = self.top_cached in
(* Section 2.3 of Chase & Lev: over-approximate the size using the
cached value of [top]. Only if the deque seems close to full do we
actually read [top]. *)
let size_approx = b - t_approx in
if size_approx >= CA.size self.arr - 1 then (
(* we need to read the actual value of [top], which might entail contention. *)
let t = A.get self.top in
self.top_cached <- t;
let size = b - t in
if size >= CA.size self.arr - 1 then (* full! *) raise_notrace Full
);
CA.set self.arr b x;
A.set self.bottom (b + 1);
true
with Full -> false
exception Empty
let pop_exn (self : 'a t) : 'a =
let b = A.get self.bottom in
let b = b - 1 in
A.set self.bottom b;
let t = A.get self.top in
self.top_cached <- t;
let size = b - t in
if size < 0 then (
(* reset to basic empty state *)
A.set self.bottom t;
raise_notrace Empty
) else if size > 0 then (
(* can pop without modifying [top] *)
let x = CA.get self.arr b in
x
) else (
assert (size = 0);
(* there was exactly one slot, so we might be racing against stealers
to update [self.top] *)
if A.compare_and_set self.top t (t + 1) then (
let x = CA.get self.arr b in
A.set self.bottom (t + 1);
x
) else (
A.set self.bottom (t + 1);
raise_notrace Empty
)
)
let[@inline] pop self : _ option =
match pop_exn self with
| exception Empty -> None
| t -> Some t
let steal (self : 'a t) : 'a option =
(* read [top], but do not update [top_cached]
as we're in another thread *)
let t = A.get self.top in
let b = A.get self.bottom in
let size = b - t in
if size <= 0 then
None
else (
let x = CA.get self.arr t in
if A.compare_and_set self.top t (t + 1) then
(* successfully increased top to consume [x] *)
Some x
else
None
)

src/private/ws_deque_.mli Normal file

@@ -0,0 +1,30 @@
(** Work-stealing deque.
Adapted from "Dynamic circular work stealing deque", Chase & Lev.
Note however that this implementation is not dynamic: there is no
resizing. Instead, [push] returns [false] when the deque is full, which
keeps the implementation fairly lightweight. *)
type 'a t
(** Deque containing values of type ['a] *)
val create : dummy:'a -> unit -> 'a t
(** Create a new deque. *)
val push : 'a t -> 'a -> bool
(** Push a value at the bottom of the deque. Returns [true] if it succeeds,
[false] if the deque is full. This must be called only by the owner
thread. *)
val pop : 'a t -> 'a option
(** Pop a value from the bottom of the deque. This must be called only by
the owner thread. *)
exception Empty
val pop_exn : 'a t -> 'a
(** Same as {!pop}, but raises {!Empty} if no element can be popped. *)
val steal : 'a t -> 'a option
(** Try to steal a value from the top of the deque. This is thread-safe. *)
val size : _ t -> int
(** Current number of elements (an approximation under concurrent access). *)
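
A short usage sketch, single-threaded for determinism (in the pool, [steal] is called from other workers while only the owner calls [push] and [pop]):
let () =
  let q : int Ws_deque_.t = Ws_deque_.create ~dummy:0 () in
  assert (Ws_deque_.push q 1);
  assert (Ws_deque_.push q 2);
  (* the owner pops LIFO from the bottom *)
  assert (Ws_deque_.pop q = Some 2);
  (* a stealer takes the oldest element, from the top *)
  assert (Ws_deque_.steal q = Some 1)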

test/data/d1/large Normal file

@@ -0,0 +1 @@
bigdata 1-20

test/data/d1/large_1 Normal file

@@ -0,0 +1 @@
bigdata 1-1

test/data/d1/large_10 Normal file

@@ -0,0 +1 @@
bigdata 1-10

test/data/d1/large_11 Normal file

@@ -0,0 +1 @@
bigdata 1-11

test/data/d1/large_12 Normal file

@@ -0,0 +1 @@
bigdata 1-12

test/data/d1/large_13 Normal file

@@ -0,0 +1 @@
bigdata 1-13

test/data/d1/large_14 Normal file

@@ -0,0 +1 @@
bigdata 1-14

test/data/d1/large_15 Normal file

@@ -0,0 +1 @@
bigdata 1-15

test/data/d1/large_16 Normal file

@@ -0,0 +1 @@
bigdata 1-16

test/data/d1/large_17 Normal file

@@ -0,0 +1 @@
bigdata 1-17

test/data/d1/large_18 Normal file

@@ -0,0 +1 @@
bigdata 1-18

test/data/d1/large_19 Normal file

@@ -0,0 +1 @@
bigdata 1-19

test/data/d1/large_2 Normal file

@@ -0,0 +1 @@
bigdata 1-2

test/data/d1/large_20 Normal file

@@ -0,0 +1 @@
bigdata 1-20
Some files were not shown because too many files have changed in this diff.