ocaml-containers/core/CCTrie.ml
2014-11-08 01:28:42 +01:00

555 lines
16 KiB
OCaml

(*
copyright (c) 2013-2014, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Prefix Tree} *)
type 'a sequence = ('a -> unit) -> unit
type 'a ktree = unit -> [`Nil | `Node of 'a * 'a ktree list]
(** {2 Signatures} *)
(** {6 A Composite Word}
Words are made of characters, who belong to a total order *)
module type WORD = sig
type t
type char_
val compare : char_ -> char_ -> int
val to_seq : t -> char_ sequence
val of_list : char_ list -> t
end
module type S = sig
type char_
type key
type 'a t
val empty : 'a t
val is_empty : _ t -> bool
val add : key -> 'a -> 'a t -> 'a t
(** Add a binding to the trie (possibly erasing the previous one) *)
val remove : key -> 'a t -> 'a t
(** Remove the key, if present *)
val find : key -> 'a t -> 'a option
(** Find the value associated with the key, if any *)
val find_exn : key -> 'a t -> 'a
(** Same as {!find} but can fail.
@raise Not_found if the key is not present *)
val update : key -> ('a option -> 'a option) -> 'a t -> 'a t
(** Update the binding for the given key. The function is given
[None] if the key is absent, or [Some v] if [key] is bound to [v];
if it returns [None] the key is removed, otherwise it
returns [Some y] and [key] becomes bound to [y] *)
val fold : ('b -> key -> 'a -> 'b) -> 'b -> 'a t -> 'b
(** Fold on key/value bindings. Will use {!WORD.of_list} to rebuild keys. *)
val iter : (key -> 'a -> unit) -> 'a t -> unit
(** Same as {!fold}, but for effectful functions *)
val fold_values : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b
(** More efficient version of {!fold}, that doesn't keep keys *)
val iter_values : ('a -> unit) -> 'a t -> unit
val merge : ('a -> 'a -> 'a option) -> 'a t -> 'a t -> 'a t
(** Merge two tries together. The function is used in
case of conflicts, when a key belongs to both tries *)
val size : _ t -> int
(** Number of bindings *)
(** {6 Conversions} *)
val to_list : 'a t -> (key * 'a) list
val of_list : (key * 'a) list -> 'a t
val to_seq : 'a t -> (key * 'a) sequence
val of_seq : (key * 'a) sequence -> 'a t
val to_seq_values : 'a t -> 'a sequence
val to_tree : 'a t -> [`Char of char_ | `Val of 'a | `Switch] ktree
(** {6 Ranges} *)
val above : key -> 'a t -> (key * 'a) sequence
(** All bindings whose key is bigger than (or equal to) the given key *)
val below : key -> 'a t -> (key * 'a) sequence
(** All bindings whose key is smaller or equal to the given key *)
end
module Make(W : WORD) = struct
type char_ = W.char_
type key = W.t
module M = Map.Make(struct
type t = char_
let compare = W.compare
end)
type 'a t =
| Empty
| Path of char_ list * 'a t
| Node of 'a option * 'a t M.t
(* invariants:
- for Path(l,t) l is never empty
- for Node (None,map) map always has at least 2 elements
- for Node (Some _,map) map can be anything *)
let empty = Empty
let _invariant = function
| Path ([],_) -> false
| Node (None, map) when M.is_empty map -> false
| _ -> true
let is_empty = function
| Empty -> true
| _ -> false
let _id x = x
let _fold_seq f ~finish acc seq =
let acc = ref acc in
seq (fun x -> acc := f !acc x);
finish !acc
let _filter_map_seq f seq k =
seq (fun x -> match f x with
| None -> ()
| Some y -> k y)
let _seq_append_list l seq =
let l = ref l in
seq (fun x -> l := x :: !l);
!l
let _seq_map map k =
M.iter (fun key v -> k (key,v)) map
let _is_path = function
| Path _ -> true
| _ -> false
(* return common prefix, and disjoint suffixes *)
let rec _merge_lists l1 l2 = match l1, l2 with
| [], _
| _, [] -> [], l1, l2
| c1::l1', c2::l2' ->
if W.compare c1 c2 = 0
then
let pre, rest1, rest2 = _merge_lists l1' l2' in
c1::pre, rest1, rest2
else
[], l1, l2
(* prefix [l] to the tree [t] *)
let _mk_path l t = match l, t with
| [], _ -> t
| _, Empty -> Empty
| _, Node _ -> Path (l, t)
| _, Path (l',t') ->
assert (not(_is_path t'));
Path (l@l', t')
let _mk_path_cons x t = match t with
| Empty -> Empty
| Node _ -> Path ([x], t)
| Path (l', t') ->
assert (not(_is_path t'));
Path (x::l', t')
(* build a Node value *)
let _mk_node value map = match value with
| Some _ -> Node (value, map)
| None ->
if M.is_empty map then Empty
else
let high, t' = M.max_binding map in
let low, _ = M.min_binding map in
if W.compare low high = 0
then _mk_path [high] t' (* only one element *)
else Node (value,map)
let _remove_sub c t = match t with
| Empty -> t
| Path ([], _) -> assert false
| Path (c'::_, _) ->
if W.compare c c' = 0
then Empty
else t
| Node (value, map) ->
if M.mem c map
then
let map' = M.remove c map in
_mk_node value map'
else t
let update key f t =
(* [state]: current subtree and rebuild function; [x]: current char *)
let goto (t, rebuild) c =
match t with
| Empty -> (t, fun t -> rebuild (_mk_path_cons c t))
| Path ([], _) -> assert false
| Path (c'::l, t') ->
if W.compare c c' = 0
then
(* follow the path *)
_mk_path l t', (fun t -> rebuild (_mk_path_cons c t))
else
(* exit the path, so we have an empty tree. Upon rebuild we
potentially need to make a map *)
let rebuild' new_child =
rebuild (
if is_empty new_child then t
else
let map = M.singleton c new_child in
let map = M.add c' (_mk_path l t') map in
_mk_node None map
)
in
empty, rebuild'
| Node (value, map) ->
try
let t' = M.find c map in
(* rebuild: we modify [t], so we put the new version in [map]
if it's not empty, and make the node again *)
let rebuild' new_child =
rebuild (
if is_empty new_child
then _mk_node value (M.remove c map)
else _mk_node value (M.add c new_child map)
)
in
t', rebuild'
with Not_found ->
let rebuild' new_child =
if is_empty new_child
then rebuild t (* ignore *)
else
let map' = M.add c new_child map in
rebuild (_mk_node value map')
in
empty, rebuild'
in
let finish (t,rebuild) = match t with
| Empty -> rebuild (_mk_node (f None) M.empty)
| Path ([], _) -> assert false
| Path (c::l', t') ->
rebuild (
match f None with
| None -> t (* TODO: raise exception & return original tree *)
| Some _ as v ->
_mk_node v (M.singleton c (_mk_path l' t'))
)
| Node (value, map) ->
let value' = f value in
rebuild (_mk_node value' map)
in
let word = W.to_seq key in
_fold_seq goto ~finish (t, _id) word
let add k v t = update k (fun _ -> Some v) t
let remove k t = update k (fun _ -> None) t
let find_exn k t =
(* at subtree [t], and character [c] *)
let goto t c = match t with
| Empty -> raise Not_found
| Path ([], _) -> assert false
| Path (c'::l, t') ->
if W.compare c c' = 0
then _mk_path l t'
else raise Not_found
| Node (_, map) -> M.find c map
and finish t = match t with
| Node (Some v, _) -> v
| _ -> raise Not_found
in
let word = W.to_seq k in
_fold_seq goto ~finish t word
let find k t =
try Some (find_exn k t)
with Not_found -> None
let _difflist_append f l = fun l' -> f (l @ l')
let _difflist_add f x = fun l' -> f (x :: l')
(* fold that also keeps the path from the root, so as to provide the list
of chars that lead to a value. The path is a difference list, ie
a function that prepends a list to some suffix *)
let rec _fold f path t acc = match t with
| Empty -> acc
| Path (l, t') -> _fold f (_difflist_append path l) t' acc
| Node (v, map) ->
let acc = match v with
| None -> acc
| Some v -> f acc path v
in
M.fold
(fun c t' acc -> _fold f (_difflist_add path c) t' acc)
map acc
let fold f acc t =
_fold
(fun acc path v ->
let key = W.of_list (path []) in
f acc key v
) _id t acc
let iter f t =
_fold
(fun () path y -> f (W.of_list (path [])) y)
_id t ()
let _iter_prefix ~prefix f t =
_fold
(fun () path y ->
let key = W.of_list (prefix (path [])) in
f key y)
_id t ()
let rec fold_values f acc t = match t with
| Empty -> acc
| Path (_, t') -> fold_values f acc t'
| Node (v, map) ->
let acc = match v with
| None -> acc
| Some v -> f acc v
in
M.fold
(fun _c t' acc -> fold_values f acc t')
map acc
let iter_values f t = fold_values (fun () x -> f x) () t
let rec merge f t1 t2 = match t1, t2 with
| Empty, _ -> t2
| _, Empty -> t1
| Path (l1,t1'), Path (l2,t2') ->
let common, l1', l2' = _merge_lists l1 l2 in
begin match l1', l2' with
| c1::l1'', c2::l2'' ->
(* need to build a map here, to represent the choice
between [c1] and [c2] *)
assert (W.compare c1 c2 <> 0);
let map = M.add c1 (_mk_path l1'' t1') M.empty in
let map = M.add c2 (_mk_path l2'' t2') map in
_mk_path common (Node (None, map))
| _ ->
_mk_path common
(merge f
(_mk_path l1' t1')
(_mk_path l2' t2')
)
end
| Path ([], _), _ -> assert false
| Path (c1::l1, t1'), Node (value, map) ->
begin try
(* collision *)
let t2' = M.find c1 map in
let new_t = merge f (_mk_path l1 t1') t2' in
let map' = if is_empty new_t
then M.remove c1 map
else M.add c1 new_t map
in
_mk_node value map'
with Not_found ->
(* no collision *)
assert (not(is_empty t1'));
Node (value, M.add c1 (_mk_path l1 t1') map)
end
| Node _, Path _ -> merge f t2 t1 (* previous case *)
| Node(v1, map1), Node (v2, map2) ->
let v = match v1, v2 with
| None, _ -> v2
| _, None -> v1
| Some v1, Some v2 -> f v1 v2
in
let map' = M.merge
(fun _c t1 t2 -> match t1, t2 with
| None, None -> assert false
| Some t, None
| None, Some t -> Some t
| Some t1, Some t2 ->
let new_t = merge f t1 t2 in
if is_empty new_t then None else Some new_t
) map1 map2
in
_mk_node v map'
let rec size t = match t with
| Empty -> 0
| Path (_, t') -> size t'
| Node (v, map) ->
let s = if v=None then 0 else 1 in
M.fold
(fun _ t' acc -> size t' + acc)
map s
let to_list t = fold (fun acc k v -> (k,v)::acc) [] t
let of_list l =
List.fold_left (fun acc (k,v) -> add k v acc) empty l
let to_seq t k = iter (fun key v -> k (key,v)) t
let to_seq_values t k = iter_values k t
let of_seq seq =
_fold_seq (fun acc (k,v) -> add k v acc) ~finish:_id empty seq
let rec to_tree t () =
let _tree_node x l () = `Node (x,l) in
match t with
| Empty -> `Nil
| Path ([], _) -> assert false
| Path (c::l, t') -> `Node (`Char c, [to_tree (_mk_path l t')])
| Node (v, map) ->
let x = match v with
| None -> `Switch
| Some v -> `Val v
in
let l = M.bindings map in
`Node(x, List.map (fun (c,t') -> _tree_node (`Char c) [to_tree t']) l)
(** {6 Ranges} *)
(* range above or below a threshold.
[p c c'] must return [true] if [c'], in the tree, meets some criterion
w.r.t [c] which is a part of the key. *)
let _half_range ~p key t k =
(* at subtree [cur = Some (t,trail)] or [None], alternatives above
[alternatives], and char [c] in [key]. *)
let on_char (cur, alternatives) c =
match cur with
| None -> (None, alternatives)
| Some (Empty,_) -> (None, alternatives)
| Some (Path ([], _),_) -> assert false
| Some (Path (c'::l, t'), trail) ->
if W.compare c c' = 0
then Some (_mk_path l t', _difflist_add trail c), alternatives
else None, alternatives
| Some (Node (_, map), trail) ->
let alternatives =
let seq = _seq_map map in
let seq = _filter_map_seq
(fun (c', t') -> if p c c'
then Some (t', _difflist_add trail c')
else None
) seq
in
_seq_append_list alternatives seq
in
begin try
let t' = M.find c map in
Some (t', _difflist_add trail c), alternatives
with Not_found ->
None, alternatives
end
(* run through the current path (if any) and alternatives *)
and finish (cur,alternatives) =
begin match cur with
| Some (t, prefix) ->
_iter_prefix ~prefix (fun key' v -> k (key', v)) t
| None -> ()
end;
List.iter
(fun (t,prefix) -> _iter_prefix ~prefix (fun key' v -> k (key', v)) t)
alternatives
in
let word = W.to_seq key in
_fold_seq on_char ~finish (Some(t,_id), []) word
let above key t =
_half_range ~p:(fun c c' -> W.compare c c' < 0) key t
let below key t =
_half_range ~p:(fun c c' -> W.compare c c' > 0) key t
end
module type ORDERED = sig
type t
val compare : t -> t -> int
end
module MakeArray(X : ORDERED) = Make(struct
type t = X.t array
type char_ = X.t
let compare = X.compare
let to_seq a k = Array.iter k a
let of_list = Array.of_list
end)
module MakeList(X : ORDERED) = Make(struct
type t = X.t list
type char_ = X.t
let compare = X.compare
let to_seq a k = List.iter k a
let of_list l = l
end)
module String = Make(struct
type t = string
type char_ = char
let compare = Char.compare
let to_seq s k = String.iter k s
let of_list l =
let buf = Buffer.create (List.length l) in
List.iter (fun c -> Buffer.add_char buf c) l;
Buffer.contents buf
end)
(*$T
String.of_list ["a", 1; "b", 2] |> String.size = 2
String.of_list ["a", 1; "b", 2; "a", 3] |> String.size = 2
String.of_list ["a", 1; "b", 2] |> String.find_exn "a" = 1
String.of_list ["a", 1; "b", 2] |> String.find_exn "b" = 2
String.of_list ["a", 1; "b", 2] |> String.find "c" = None
String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "cat" = 1
String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "catogan" = 2
String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "foo" = 3
String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find "cato" = None
*)