From 7b7eda5a05c131be988e43a674c448c6bf4c7bc1 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Fri, 5 Jan 2024 21:47:09 -0500 Subject: [PATCH] wip: persistent vectors based on clojure's --- src/pvec/containers_pvec.ml | 368 +++++++++++++++++++++++++++++++++++ src/pvec/containers_pvec.mli | 84 ++++++++ src/pvec/dune | 8 + 3 files changed, 460 insertions(+) create mode 100644 src/pvec/containers_pvec.ml create mode 100644 src/pvec/containers_pvec.mli create mode 100644 src/pvec/dune diff --git a/src/pvec/containers_pvec.ml b/src/pvec/containers_pvec.ml new file mode 100644 index 00000000..171733c9 --- /dev/null +++ b/src/pvec/containers_pvec.ml @@ -0,0 +1,368 @@ +(* HAMT. + We follow https://hypirion.com/musings/understanding-persistent-vector-pt-1 + and following posts. *) + +type 'a iter = ('a -> unit) -> unit + +let num_bits = 2 +let branching_factor = 1 lsl num_bits +let bitmask = branching_factor - 1 + +(** Short array with functional semantics *) +module A = struct + open Array + + type 'a t = 'a array + + let length = length + let get = get + let[@inline] is_empty self = self = [||] + let[@inline] return self = [| self |] + let[@inline] is_full self = length self = branching_factor + + let[@inline] push (self : _ t) x = + let n = length self in + if n = branching_factor then invalid_arg "Pvec.push"; + let arr = Array.make (n + 1) x in + Array.blit self 0 arr 0 n; + arr + + let[@inline] pop self : _ t = + let n = length self in + if n = 0 then invalid_arg "Pvec.pop"; + Array.sub self 0 (n - 1) + + (* TODO: remove *) + let set ~mut (self : _ t) i x : _ t = + if i < 0 || i > length self || i >= branching_factor then + invalid_arg "Pvec.set"; + if i = length self then ( + (* insert in a longer copy *) + let arr = Array.make (i + 1) x in + Array.blit self 0 arr 0 i; + arr + ) else if mut then ( + (* replace element at [i] in place *) + self.(i) <- x; + self + ) else ( + (* replace element at [i] in copy *) + let arr = Array.copy self in + arr.(i) <- x; + arr + ) +end + +type 'a tree = Empty | Node of 'a tree A.t | Leaf of 'a A.t + +type 'a t = { + t: 'a tree; (** The 32-way tree *) + size: int; (** Exact number of elements *) + shift: int; (** num_bits*(depth of tree) *) + tail: 'a A.t; (** Tail array, for fast push/pop *) +} +(* invariants: + - if size>0 then [not (is_empty tail)] + - all leaves in [t] are at depth shift/5 +*) + +(* FIXME: remove *) +let dbg_ = ref (fun _ _ _ -> assert false) +let empty_tree = Empty +let empty = { t = empty_tree; size = 0; shift = 0; tail = [||] } + +let[@inline] is_empty_tree = function + | Empty -> true + | _ -> false + +let[@inline] is_empty (self : _ t) = self.size = 0 +let[@inline] length (self : _ t) = self.size +let[@inline] return x = { empty with size = 1; tail = A.return x } +let[@inline] tail_off (self : _ t) : int = self.size - A.length self.tail + +let[@unroll 2] rec get_tree_ (self : 'a tree) (shift : int) i : 'a = + match self with + | Empty -> invalid_arg "pvec.get" + | Leaf a -> A.get a (i land bitmask) + | Node a -> + let idx = (i lsr shift) land bitmask in + get_tree_ (A.get a idx) (shift - num_bits) i + +let get (self : 'a t) (i : int) : 'a = + if i < 0 then + invalid_arg "pvec.get" + else ( + let tail_off = self.size - A.length self.tail in + if i >= tail_off then + A.get self.tail (i - tail_off) + else + get_tree_ self.t self.shift i + ) + +let[@inline] get_opt self i = + try Some (get self i) with Invalid_argument _ -> None + +(** Build a tree leading to [tail] with indices 0 at each node *) +let rec build_new_tail_spine_ shift tail : _ tree = + if shift = 0 then + Leaf tail + else + Node [| build_new_tail_spine_ (shift - num_bits) tail |] + +let rec insert_tail_ (self : _ tree) shift i (tail : _ A.t) : _ tree = + Format.printf "insert tail shift=%d i=0x%x into %a@." shift i + (!dbg_ Format.pp_print_int) + (Obj.magic self : int tree); + match self with + | Empty -> + if shift = 0 then + Leaf tail + else ( + assert ((i lsl shift) land bitmask = 0); + Node [| insert_tail_ Empty (shift - num_bits) i tail |] + ) + | Leaf _ -> assert false + | Node a -> + (* would be in the {!build_new_tail_spine_} case *) + assert (i <> 0); + let idx = (i lsr shift) land bitmask in + Printf.printf "insert tail rec at idx=%d shift=%d i=0x%x\n%!" idx shift i; + let sub, must_push = + if idx < A.length a then + A.get a idx, false + else + Empty, true + in + let new_sub = insert_tail_ sub (shift - num_bits) i tail in + let a = + if must_push then + A.push a new_sub + else + A.set ~mut:false a idx new_sub + in + Node a + +let[@inline never] push_full_ self x : _ t = + if 1 lsl (self.shift + num_bits) = self.size - A.length self.tail then ( + (* tree is full, add a level *) + let t = Node [| self.t; build_new_tail_spine_ self.shift self.tail |] in + { t; size = self.size + 1; shift = self.shift + num_bits; tail = [| x |] } + ) else ( + (* insert at the end of the current tree *) + let idx = self.size - A.length self.tail in + let t = insert_tail_ self.t self.shift idx self.tail in + { t; size = self.size + 1; shift = self.shift; tail = [| x |] } + ) + +let[@inline] push (self : _ t) x : _ t = + if A.is_full self.tail then + push_full_ self x + else + { self with tail = A.push self.tail x; size = self.size + 1 } + +let rec pop_tail_from_tree_ (self : _ tree) shift i : 'a A.t * 'a tree = + Format.printf "pop tail shift=%d i=0x%x from %a@." shift i + (!dbg_ Format.pp_print_int) + (Obj.magic self : int tree); + match self with + | Empty -> assert false + | Leaf tail -> + assert (shift = 0); + tail, Empty + | Node a -> + let idx = (i lsr shift) land bitmask in + let sub = A.get a idx in + let tail, new_sub = pop_tail_from_tree_ sub (shift - num_bits) i in + let new_tree = + if is_empty_tree new_sub then ( + let a = A.pop a in + if A.is_empty a then + Empty + else + Node a + ) else + Node (A.set ~mut:false a idx new_sub) + in + tail, new_tree + +let[@inline never] move_last_leaf_to_tail (self : _ t) : _ t = + assert (A.length self.tail = 1); + if self.size = 1 then + (* back to empty *) + empty + else ( + (* before pop *) + let tail, t = pop_tail_from_tree_ self.t self.shift (self.size - 1) in + let t, shift = + match t with + | Node [| t' |] -> + (* all indices have 00000 as MSB, remove one level *) + t', self.shift - 1 + | _ -> t, self.shift + in + { tail; size = self.size - 1; shift; t } + ) + +let pop (self : 'a t) : 'a * 'a t = + if self.size = 0 then invalid_arg "pvec.pop"; + let x = A.get self.tail (A.length self.tail - 1) in + let new_tail = A.pop self.tail in + if A.is_empty new_tail then ( + let new_self = move_last_leaf_to_tail self in + x, new_self + ) else ( + let new_self = { self with size = self.size - 1; tail = new_tail } in + x, new_self + ) + +let pop_opt (self : 'a t) : ('a * 'a t) option = + if self.size = 0 then + None + else + Some (pop self) + +let rec iteri_rec_ f idx (self : _ tree) = + match self with + | Empty -> () + | Leaf a -> + for i = 0 to A.length a - 1 do + let j = idx lor i in + f j (Array.unsafe_get a i) + done + | Node a -> + for i = 0 to A.length a - 1 do + let idx = (idx lsl num_bits) lor i in + iteri_rec_ f idx (Array.unsafe_get a i) + done + +let iteri f (self : 'a t) : unit = + iteri_rec_ f 0 self.t; + let tail_off = tail_off self in + for i = 0 to A.length self.tail - 1 do + f (i + tail_off) (Array.unsafe_get self.tail i) + done + +let rec iteri_rev_rec_ f idx (self : _ tree) = + match self with + | Empty -> () + | Leaf a -> + for i = A.length a - 1 downto 0 do + let j = idx lor i in + f j (Array.unsafe_get a i) + done + | Node a -> + for i = A.length a - 1 downto 0 do + let idx = (idx lsl num_bits) lor i in + iteri_rev_rec_ f idx (Array.unsafe_get a i) + done + +let iteri_rev f (self : 'a t) : unit = + let tail_off = tail_off self in + for i = A.length self.tail - 1 downto 0 do + f (i + tail_off) (Array.unsafe_get self.tail i) + done; + iteri_rev_rec_ f (tail_off - 1) self.t + +let fold_lefti f x m = + let acc = ref x in + iteri (fun i x -> acc := f !acc i x) m; + !acc + +let foldi_rev f x m = + let acc = ref x in + iteri_rev (fun i x -> acc := f !acc i x) m; + !acc + +let iter f m = iteri (fun _ x -> f x) m +let fold_left f x m = fold_lefti (fun acc _ x -> f acc x) x m +let fold_rev f x m = foldi_rev (fun acc _ x -> f acc x) x m + +let rec map_t f (self : _ tree) : _ tree = + match self with + | Empty -> Empty + | Node a -> + let a = Array.map (map_t f) a in + Node a + | Leaf a -> Leaf (Array.map f a) + +let map f (self : _ t) : _ t = + { self with t = map_t f self.t; tail = Array.map f self.tail } + +let append a b = + if is_empty b then + a + else + fold_left push a b + +let add_list v l = List.fold_left push v l +let of_list l = add_list empty l +let to_list m = fold_rev (fun acc x -> x :: acc) [] m + +let add_iter v seq = + let v = ref v in + seq (fun x -> v := push !v x); + !v + +let of_iter s = add_iter empty s +let to_iter m yield = iteri (fun _ v -> yield v) m + +let make n x : _ t = + (* TODO: probably we can optimize that? *) + of_iter (fun k -> + for _i = 1 to n do + k x + done) + +let rec add_seq self seq = + match seq () with + | Seq.Nil -> self + | Seq.Cons (x, tl) -> add_seq (push self x) tl + +let of_seq seq = add_seq empty seq + +let to_seq self : _ Seq.t = + let rec to_seq (stack : ('a tree * int) list) () = + match stack with + | [] -> Seq.Nil + | (Empty, _) :: tl -> to_seq tl () + | (Leaf a, i) :: tl when i < Array.length a -> + Seq.Cons (A.get a i, to_seq ((Leaf a, i + 1) :: tl)) + | (Leaf _, _) :: tl -> to_seq tl () + | (Node a, i) :: tl when i < A.length a -> + to_seq ((A.get a i, 0) :: (Node a, i + 1) :: tl) () + | (Node _, _) :: tl -> to_seq tl () + in + to_seq [ self.t, 0; Leaf self.tail, 0 ] + +let choose self = + if self.size = 0 then + None + else + Some (A.get self.tail 0) + +module Private_ = struct + type 'a printer = Format.formatter -> 'a -> unit + + let fpf = Format.fprintf + + let pp_array ppx out a = + fpf out "[@[%a@]]" + (Format.pp_print_list + ~pp_sep:(fun out () -> Format.fprintf out ";@ ") + ppx) + (Array.to_list a) + + let rec debugtree ppx out (self : _ tree) : unit = + match self with + | Empty -> fpf out "()" + | Leaf a -> fpf out "leaf(%a)" (pp_array ppx) a + | Node a -> fpf out "node(%a)" (pp_array @@ debugtree ppx) a + + let debug ppx out self = + fpf out + "@[pvec {@ size: %d; shift: %d;@ @[<2>tree:@ %a@];@ @[<2>tail:@ \ + %a@]@]}" + self.size self.shift (debugtree ppx) self.t (pp_array ppx) self.tail + + let () = dbg_ := debugtree +end diff --git a/src/pvec/containers_pvec.mli b/src/pvec/containers_pvec.mli new file mode 100644 index 00000000..bee50bec --- /dev/null +++ b/src/pvec/containers_pvec.mli @@ -0,0 +1,84 @@ +(** Functional Vectors. + + These are trees with a large branching factor for logarithmic operations with + a low multiplicative factor. + + {b status: experimental} + + @since NEXT_RELEASE +*) + +type 'a iter = ('a -> unit) -> unit +type !'a t + +val empty : 'a t +(** Empty vector. *) + +val is_empty : _ t -> bool +(** Is the vector empty? *) + +val return : 'a -> 'a t +(** Single element vector. *) + +val length : _ t -> int +(** Number of elements. Constant time. *) + +val make : int -> 'a -> 'a t +(** [make n x] makes a vector with [n] copies + of the element [x] *) + +val push : 'a t -> 'a -> 'a t +(** Add element at the end. *) + +val get : 'a t -> int -> 'a +(** @raise Invalid_argument if key not present. *) + +val get_opt : 'a t -> int -> 'a option + +val pop : 'a t -> 'a * 'a t +(** Pop last element. + @raise Invalid_argument in case the vec is empty. *) + +val pop_opt : 'a t -> ('a * 'a t) option +(** Pop last element. *) + +val iter : ('a -> unit) -> 'a t -> unit + +val iteri : (int -> 'a -> unit) -> 'a t -> unit +(** Iterate on elements with their index, in increasing order. *) + +val iteri_rev : (int -> 'a -> unit) -> 'a t -> unit +(** Iterate on elements with their index, but starting from the end. *) + +val fold_left : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b +val fold_rev : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b +val fold_lefti : ('b -> int -> 'a -> 'b) -> 'b -> 'a t -> 'b + +val append : 'a t -> 'a t -> 'a t +(** [append a b] adds all elements of [b] at the end of [a]. This is + at least linear in the length of [b]. *) + +val map : ('a -> 'b) -> 'a t -> 'b t + +val choose : 'a t -> 'a option +(** Return an element. It is unspecified which one is returned. *) + +val to_list : 'a t -> 'a list +val of_list : 'a list -> 'a t +val add_list : 'a t -> 'a list -> 'a t +val add_iter : 'a t -> 'a iter -> 'a t +val of_iter : 'a iter -> 'a t +val to_iter : 'a t -> 'a iter +val add_seq : 'a t -> 'a Seq.t -> 'a t +val of_seq : 'a Seq.t -> 'a t +val to_seq : 'a t -> 'a Seq.t + +(**/**) + +module Private_ : sig + type 'a printer = Format.formatter -> 'a -> unit + + val debug : 'a printer -> 'a t printer +end + +(**/**) diff --git a/src/pvec/dune b/src/pvec/dune new file mode 100644 index 00000000..6ff76874 --- /dev/null +++ b/src/pvec/dune @@ -0,0 +1,8 @@ + +(library + (name containers_pvec) + (public_name containers.pvec) + (preprocess + (action + (run %{project_root}/src/core/cpp/cpp.exe %{input-file}))) + (synopsis "Persistent vector for OCaml"))