diff --git a/_oasis b/_oasis index 4c830279..60abbc9f 100644 --- a/_oasis +++ b/_oasis @@ -42,7 +42,7 @@ Library "containers" CCMultiSet, CCBV, CCPrint, CCPersistentHashtbl, CCError, CCHeap, CCList, CCOpt, CCPair, CCFun, CCHash, CCKList, CCInt, CCBool, CCArray, CCBatch, CCOrd, - CCRandom, CCLinq, CCKTree + CCRandom, CCLinq, CCKTree, CCTrie FindlibName: containers Library "containers_string" diff --git a/core/CCTrie.ml b/core/CCTrie.ml new file mode 100644 index 00000000..00926bce --- /dev/null +++ b/core/CCTrie.ml @@ -0,0 +1,443 @@ + +(* +copyright (c) 2013-2014, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Prefix Tree} *) + +type 'a sequence = ('a -> unit) -> unit +type 'a ktree = unit -> [`Nil | `Node of 'a * 'a ktree list] + +(** {2 Signatures} *) + +(** {6 A Composite Word} + +Words are made of characters, who belong to a total order *) + +module type WORD = sig + type t + type char_ + + val compare : char_ -> char_ -> int + val to_seq : t -> char_ sequence + val of_list : char_ list -> t +end + +module type S = sig + type char_ + type key + + type 'a t + + val empty : 'a t + + val is_empty : _ t -> bool + + val add : key -> 'a -> 'a t -> 'a t + (** Add a binding to the trie (possibly erasing the previous one) *) + + val remove : key -> 'a t -> 'a t + (** Remove the key, if present *) + + val find : key -> 'a t -> 'a option + (** Find the value associated with the key, if any *) + + val find_exn : key -> 'a t -> 'a + (** Same as {!find} but can fail. + @raise Not_found if the key is not present *) + + val update : key -> ('a option -> 'a option) -> 'a t -> 'a t + (** Update the binding for the given key. The function is given + [None] if the key is absent, or [Some v] if [key] is bound to [v]; + if it returns [None] the key is removed, otherwise it + returns [Some y] and [key] becomes bound to [y] *) + + val fold : ('b -> key -> 'a -> 'b) -> 'b -> 'a t -> 'b + (** Fold on key/value bindings. Will use {!WORD.of_list} to rebuild keys. *) + + val iter : (key -> 'a -> unit) -> 'a t -> unit + (** Same as {!fold}, but for effectful functions *) + + val fold_values : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b + (** More efficient version of {!fold}, that doesn't keep keys *) + + val iter_values : ('a -> unit) -> 'a t -> unit + + val merge : ('a -> 'a -> 'a option) -> 'a t -> 'a t -> 'a t + (** Merge two tries together. The function is used in + case of conflicts, when a key belongs to both tries *) + + val size : _ t -> int + (** Number of bindings *) + + (** {6 Conversions} *) + + val to_list : 'a t -> (key * 'a) list + + val of_list : (key * 'a) list -> 'a t + + val to_seq : 'a t -> (key * 'a) sequence + + val of_seq : (key * 'a) sequence -> 'a t + + val to_seq_values : 'a t -> 'a sequence + + val to_tree : 'a t -> [`Char of char_ | `Val of 'a | `Switch] ktree +end + +module Make(W : WORD) = struct + type char_ = W.char_ + type key = W.t + + module M = Map.Make(struct + type t = char_ + let compare = W.compare + end) + + type 'a t = + | Empty + | Path of char_ list * 'a t + | Node of 'a option * 'a t M.t + + (* invariants: + - for Path(l,t) l is never empty + - for Node (None,map) map always has at least 2 elements + - for Node (Some _,map) map can be anything *) + + let empty = Empty + + let _invariant = function + | Path ([],_) -> false + | Node (None, map) when M.is_empty map -> false + | _ -> true + + let is_empty = function + | Empty -> true + | _ -> false + + let _id x = x + + let _fold_seq f ~finish acc seq = + let acc = ref acc in + seq (fun x -> acc := f !acc x); + finish !acc + + let _is_path = function + | Path _ -> true + | _ -> false + + (* return common prefix, and disjoint suffixes *) + let rec _merge_lists l1 l2 = match l1, l2 with + | [], _ + | _, [] -> [], l1, l2 + | c1::l1', c2::l2' -> + if W.compare c1 c2 = 0 + then + let pre, rest1, rest2 = _merge_lists l1' l2' in + c1::pre, rest1, rest2 + else + [], l1, l2 + + (* prefix [l] to the tree [t] *) + let _mk_path l t = match l, t with + | [], _ -> t + | _, Empty -> Empty + | _, Node _ -> Path (l, t) + | _, Path (l',t') -> + assert (not(_is_path t')); + Path (l@l', t') + + let _mk_path_cons x t = match t with + | Empty -> Empty + | Node _ -> Path ([x], t) + | Path (l', t') -> + assert (not(_is_path t')); + Path (x::l', t') + + (* build a Node value *) + let _mk_node value map = match value with + | Some _ -> Node (value, map) + | None -> + if M.is_empty map then Empty + else + let high, t' = M.max_binding map in + let low, _ = M.min_binding map in + if W.compare low high = 0 + then _mk_path [high] t' (* only one element *) + else Node (value,map) + + let _remove_sub c t = match t with + | Empty -> t + | Path ([], _) -> assert false + | Path (c'::l, t') -> + if W.compare c c' = 0 + then Empty + else t + | Node (value, map) -> + if M.mem c map + then + let map' = M.remove c map in + _mk_node value map' + else t + + let update key f t = + (* [state]: current subtree and rebuild function; [x]: current char *) + let goto (t, rebuild) c = + match t with + | Empty -> (t, fun t -> rebuild (_mk_path_cons c t)) + | Path ([], _) -> assert false + | Path (c'::l, t') -> + if W.compare c c' = 0 + then + (* follow the path *) + _mk_path l t', (fun t -> rebuild (_mk_path_cons c t)) + else + (* exit the path, so we have an empty tree. Upon rebuild we + potentially need to make a map *) + let rebuild' new_child = + rebuild ( + if is_empty new_child then t + else + let map = M.singleton c new_child in + let map = M.add c' (_mk_path l t') map in + _mk_node None map + ) + in + empty, rebuild' + | Node (value, map) -> + try + let t' = M.find c map in + (* rebuild: we modify [t], so we put the new version in [map] + if it's not empty, and make the node again *) + let rebuild' new_child = + rebuild ( + if is_empty new_child + then _mk_node value (M.remove c map) + else _mk_node value (M.add c new_child map) + ) + in + t', rebuild' + with Not_found -> + let rebuild' new_child = + if is_empty new_child + then rebuild t (* ignore *) + else + let map' = M.add c new_child map in + rebuild (_mk_node value map') + in + empty, rebuild' + in + let finish (t,rebuild) = match t with + | Empty -> rebuild (_mk_node (f None) M.empty) + | Path ([], _) -> assert false + | Path (c::l', t') -> + rebuild ( + match f None with + | None -> t (* TODO: raise exception & return original tree *) + | Some _ as v -> + _mk_node v (M.singleton c (_mk_path l' t')) + ) + | Node (value, map) -> + let value' = f value in + rebuild (_mk_node value' map) + in + let word = W.to_seq key in + _fold_seq goto ~finish (t, _id) word + + let add k v t = update k (fun _ -> Some v) t + + let remove k t = update k (fun _ -> None) t + + let find_exn k t = + (* at subtree [t], and character [c] *) + let goto t c = match t with + | Empty -> raise Not_found + | Path ([], _) -> assert false + | Path (c'::l, t') -> + if W.compare c c' = 0 + then _mk_path l t' + else raise Not_found + | Node (_, map) -> M.find c map + and finish t = match t with + | Node (Some v, _) -> v + | _ -> raise Not_found + in + let word = W.to_seq k in + _fold_seq goto ~finish t word + + let find k t = + try Some (find_exn k t) + with Not_found -> None + + let _difflist_append f l = fun l' -> f (l @ l') + let _difflist_add f x = fun l' -> f (x :: l') + + let fold f acc t = + (* also keep the path from the root, so as to provide the list + of chars that lead to a value. The path is a difference list, ie + a function that prepends a list to some suffix *) + let rec aux path t acc = match t with + | Empty -> acc + | Path (l, t') -> aux (_difflist_append path l) t' acc + | Node (v, map) -> + let acc = match v with + | None -> acc + | Some v -> f acc (W.of_list (path [])) v + in + M.fold + (fun c t' acc -> aux (_difflist_add path c) t' acc) + map acc + in aux _id t acc + + let iter f t = fold (fun _ x y -> f x y) () t + + let rec fold_values f acc t = match t with + | Empty -> acc + | Path (_, t') -> fold_values f acc t' + | Node (v, map) -> + let acc = match v with + | None -> acc + | Some v -> f acc v + in + M.fold + (fun c t' acc -> fold_values f acc t') + map acc + + let iter_values f t = fold_values (fun () x -> f x) () t + + let rec merge f t1 t2 = match t1, t2 with + | Empty, _ -> t2 + | _, Empty -> t1 + | Path (l1,t1'), Path (l2,t2') -> + let common, l1', l2' = _merge_lists l1 l2 in + begin match l1', l2' with + | c1::l1'', c2::l2'' -> + (* need to build a map here, to represent the choice + between [c1] and [c2] *) + assert (W.compare c1 c2 <> 0); + let map = M.add c1 (_mk_path l1'' t1') M.empty in + let map = M.add c2 (_mk_path l2'' t2') map in + _mk_path common (Node (None, map)) + | _ -> + _mk_path common + (merge f + (_mk_path l1' t1') + (_mk_path l2' t2') + ) + end + | Path ([], _), _ -> assert false + | Path (c1::l1, t1'), Node (value, map) -> + begin try + (* collision *) + let t2' = M.find c1 map in + let new_t = merge f (_mk_path l1 t1') t2' in + let map' = if is_empty new_t + then M.remove c1 map + else M.add c1 new_t map + in + _mk_node value map' + with Not_found -> + (* no collision *) + assert (not(is_empty t1')); + Node (value, M.add c1 (_mk_path l1 t1') map) + end + | Node _, Path _ -> merge f t2 t1 (* previous case *) + | Node(v1, map1), Node (v2, map2) -> + let v = match v1, v2 with + | None, _ -> v2 + | _, None -> v1 + | Some v1, Some v2 -> f v1 v2 + in + let map' = M.merge + (fun _c t1 t2 -> match t1, t2 with + | None, None -> assert false + | Some t, None + | None, Some t -> Some t + | Some t1, Some t2 -> + let new_t = merge f t1 t2 in + if is_empty new_t then None else Some new_t + ) map1 map2 + in + _mk_node v map' + + let rec size t = match t with + | Empty -> 0 + | Path (_, t') -> size t' + | Node (v, map) -> + let s = if v=None then 0 else 1 in + M.fold + (fun _ t' acc -> size t' + acc) + map s + + let to_list t = fold (fun acc k v -> (k,v)::acc) [] t + + let of_list l = + List.fold_left (fun acc (k,v) -> add k v acc) empty l + + let to_seq t k = iter (fun key v -> k (key,v)) t + + let to_seq_values t k = iter_values k t + + let of_seq seq = + _fold_seq (fun acc (k,v) -> add k v acc) ~finish:_id empty seq + + let rec to_tree t () = + let _tree_node x l () = `Node (x,l) in + match t with + | Empty -> `Nil + | Path ([], _) -> assert false + | Path (c::l, t') -> `Node (`Char c, [to_tree (_mk_path l t')]) + | Node (v, map) -> + let x = match v with + | None -> `Switch + | Some v -> `Val v + in + let l = M.bindings map in + `Node(x, List.map (fun (c,t') -> _tree_node (`Char c) [to_tree t']) l) +end + +module String = Make(struct + type t = string + type char_ = char + let compare = Char.compare + let to_seq s k = String.iter k s + let of_list l = + let s = String.create (List.length l) in + List.iteri (fun i c -> s.[i] <- c) l; + s +end) + +(*$T + String.of_list ["a", 1; "b", 2] |> String.size = 2 + String.of_list ["a", 1; "b", 2; "a", 3] |> String.size = 2 + String.of_list ["a", 1; "b", 2] |> String.find_exn "a" = 1 + String.of_list ["a", 1; "b", 2] |> String.find_exn "b" = 2 + String.of_list ["a", 1; "b", 2] |> String.find "c" = None + + String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "cat" = 1 + String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "catogan" = 2 + String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find_exn "foo" = 3 + String.of_list ["cat", 1; "catogan", 2; "foo", 3] |> String.find "cato" = None +*) + diff --git a/core/CCTrie.mli b/core/CCTrie.mli new file mode 100644 index 00000000..38c4a479 --- /dev/null +++ b/core/CCTrie.mli @@ -0,0 +1,113 @@ + +(* +copyright (c) 2013-2014, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Prefix Tree} *) + +type 'a sequence = ('a -> unit) -> unit +type 'a ktree = unit -> [`Nil | `Node of 'a * 'a ktree list] + +(** {2 Signatures} *) + +(** {6 A Composite Word} + +Words are made of characters, who belong to a total order *) + +module type WORD = sig + type t + type char_ + + val compare : char_ -> char_ -> int + val to_seq : t -> char_ sequence + val of_list : char_ list -> t +end + +module type S = sig + type char_ + type key + + type 'a t + + val empty : 'a t + + val is_empty : _ t -> bool + + val add : key -> 'a -> 'a t -> 'a t + (** Add a binding to the trie (possibly erasing the previous one) *) + + val remove : key -> 'a t -> 'a t + (** Remove the key, if present *) + + val find : key -> 'a t -> 'a option + (** Find the value associated with the key, if any *) + + val find_exn : key -> 'a t -> 'a + (** Same as {!find} but can fail. + @raise Not_found if the key is not present *) + + val update : key -> ('a option -> 'a option) -> 'a t -> 'a t + (** Update the binding for the given key. The function is given + [None] if the key is absent, or [Some v] if [key] is bound to [v]; + if it returns [None] the key is removed, otherwise it + returns [Some y] and [key] becomes bound to [y] *) + + val fold : ('b -> key -> 'a -> 'b) -> 'b -> 'a t -> 'b + (** Fold on key/value bindings. Will use {!WORD.of_list} to rebuild keys. *) + + val iter : (key -> 'a -> unit) -> 'a t -> unit + (** Same as {!fold}, but for effectful functions *) + + val fold_values : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b + (** More efficient version of {!fold}, that doesn't keep keys *) + + val iter_values : ('a -> unit) -> 'a t -> unit + + val merge : ('a -> 'a -> 'a option) -> 'a t -> 'a t -> 'a t + (** Merge two tries together. The function is used in + case of conflicts, when a key belongs to both tries *) + + val size : _ t -> int + (** Number of bindings *) + + (** {6 Conversions} *) + + val to_list : 'a t -> (key * 'a) list + + val of_list : (key * 'a) list -> 'a t + + val to_seq : 'a t -> (key * 'a) sequence + + val of_seq : (key * 'a) sequence -> 'a t + + val to_seq_values : 'a t -> 'a sequence + + val to_tree : 'a t -> [`Char of char_ | `Val of 'a | `Switch] ktree +end + +(** {2 Implementation} *) + +module Make(W : WORD) : S with type key = W.t and type char_ = W.char_ + +module String : S with type key = string and type char_ = char