diff --git a/README.md b/README.md index c6d66cc4..65b4e697 100644 --- a/README.md +++ b/README.md @@ -115,9 +115,11 @@ Documentation [here](http://cedeela.fr/~simon/software/containers). ### Containers.data +- `CCBitField`, bitfields embedded in integers - `CCCache`, memoization caches, LRU, etc. - `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation - `CCTrie`, a prefix tree +- `CCHashTrie`, a map where keys are hashed and put in a trie by hash - `CCMultimap` and `CCMultiset`, functors defining persistent structures - `CCFQueue`, a purely functional double-ended queue structure - `CCBV`, mutable bitvectors diff --git a/_oasis b/_oasis index db6a5c42..327f5a79 100644 --- a/_oasis +++ b/_oasis @@ -84,7 +84,8 @@ Library "containers_data" Modules: CCMultiMap, CCMultiSet, CCTrie, CCFlatHashtbl, CCCache, CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl, CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray, - CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField + CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField, + CCHashTrie BuildDepends: bytes FindlibParent: containers FindlibName: data diff --git a/benchs/run_benchs.ml b/benchs/run_benchs.ml index cb7161df..f2add997 100644 --- a/benchs/run_benchs.ml +++ b/benchs/run_benchs.ml @@ -198,6 +198,12 @@ module Tbl = struct let hash i = i end) + module IHashTrie = CCHashTrie.Make(struct + type t = int + let equal (i:int) j = i=j + let hash i = i land max_int + end) + let phashtbl_add n = let h = PHashtbl.create 50 in for i = n downto 0 do @@ -240,6 +246,13 @@ module Tbl = struct done; !h + let hashtrie_add n = + let h = ref IHashTrie.empty in + for i = n downto 0 do + h := IHashTrie.add i i !h; + done; + !h + let icchashtbl_add n = let h = ICCHashtbl.create 50 in for i = n downto 0 do @@ -256,6 +269,7 @@ module Tbl = struct "imap_add", (fun n -> ignore (imap_add n)), n; "intmap_add", (fun n -> ignore (intmap_add n)), n; "ccflathashtbl_add", (fun n -> ignore (icchashtbl_add n)), n; + "cchashtrie_add", (fun n -> ignore (hashtrie_add n)), n; ] let phashtbl_replace n = @@ -318,6 +332,16 @@ module Tbl = struct done; !h + let hashtrie_replace n = + let h = ref IHashTrie.empty in + for i = 0 to n do + h := IHashTrie.add i i !h; + done; + for i = n downto 0 do + h := IHashTrie.add i i !h; + done; + !h + let icchashtbl_replace n = let h = ICCHashtbl.create 50 in for i = 0 to n do @@ -337,6 +361,7 @@ module Tbl = struct "imap_replace", (fun n -> ignore (imap_replace n)), n; "intmap_replace", (fun n -> ignore (intmap_replace n)), n; "ccflathashtbl_replace", (fun n -> ignore (icchashtbl_replace n)), n; + "hashtrie_replace", (fun n -> ignore (hashtrie_replace n)), n; ] let phashtbl_find h = @@ -387,6 +412,12 @@ module Tbl = struct ignore (CCIntMap.find i m); done + let hashtrie_find m = + fun n -> + for i = 0 to n-1 do + ignore (IHashTrie.get_exn i m); + done + let icchashtbl_find m = fun n -> for i = 0 to n-1 do @@ -403,6 +434,7 @@ module Tbl = struct let m = imap_add n in let m' = intmap_add n in let h'''''' = icchashtbl_add n in + let ht = hashtrie_add n in B.throughputN 3 [ "phashtbl_find", (fun () -> phashtbl_find h n), (); "hashtbl_find", (fun () -> hashtbl_find h' n), (); @@ -413,6 +445,7 @@ module Tbl = struct "imap_find", (fun () -> imap_find m n), (); "intmap_find", (fun () -> intmap_find m' n), (); "ccflathashtbl_find", (fun () -> icchashtbl_find h'''''' n), (); + "hashtrie_find", (fun () -> hashtrie_find ht n), (); ] let () = B.Tree.register ( diff --git a/doc/intro.txt b/doc/intro.txt index 922d9d36..a8371d79 100644 --- a/doc/intro.txt +++ b/doc/intro.txt @@ -69,6 +69,7 @@ CCCache CCFQueue CCFlatHashtbl CCHashSet +CCHashTrie CCIntMap CCMixmap CCMixset diff --git a/src/data/CCHashTrie.ml b/src/data/CCHashTrie.ml new file mode 100644 index 00000000..daa457d5 --- /dev/null +++ b/src/data/CCHashTrie.ml @@ -0,0 +1,334 @@ + +(* This file is free software, part of containers. See file "license" for more details. *) + +(** {1 Hash Tries} *) + +type 'a sequence = ('a -> unit) -> unit +type 'a printer = Format.formatter -> 'a -> unit +type 'a ktree = unit -> [`Nil | `Node of 'a * 'a ktree list] + +(** {2 Fixed-Size Arrays} *) +module type FIXED_ARRAY = sig + type 'a t + val create : 'a -> 'a t + val length : int + val get : 'a t -> int -> 'a + val set : 'a t -> int -> 'a -> 'a t + val iter : ('a -> unit) -> 'a t -> unit + val fold : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b + val for_all : ('a -> bool) -> 'a t -> bool +end + +(* TODO: add an "update" function? *) + +module type S = sig + module A : FIXED_ARRAY + + type key + + type 'a t + + val empty : 'a t + + val is_empty : _ t -> bool + + val singleton : key -> 'a -> 'a t + + val add : key -> 'a -> 'a t -> 'a t + + val get : key -> 'a t -> 'a option + + val get_exn : key -> 'a t -> 'a + (** @raise Not_found if key not present *) + + val remove : key -> 'a t -> 'a t + + val cardinal : _ t -> int + + val iter : (key -> 'a -> unit) -> 'a t -> unit + + val fold : ('b -> key -> 'a -> 'b) -> 'b -> 'a t -> 'b + + val to_list : 'a t -> (key * 'a) list + + val add_list : 'a t -> (key * 'a) list -> 'a t + + val of_list : (key * 'a) list -> 'a t + + val print : key printer -> 'a printer -> 'a t printer + + val as_tree : 'a t -> [`L of int * (key * 'a) list | `N ] ktree +end + +module type KEY = sig + type t + val equal : t -> t -> bool + val hash : t -> int +end + +(** {2 Arrays} *) +module A8 : FIXED_ARRAY = struct + type 'a t = { + a0 : 'a; + a1 : 'a; + a2 : 'a; + a3 : 'a; + a4 : 'a; + a5 : 'a; + a6 : 'a; + a7 : 'a; + } + + let create x = {a0=x; a1=x; a2=x; a3=x; a4=x; a5=x; a6=x;a7=x} + + let length = 8 + + let get a i = match i with + | 0 -> a.a0 + | 1 -> a.a1 + | 2 -> a.a2 + | 3 -> a.a3 + | 4 -> a.a4 + | 5 -> a.a5 + | 6 -> a.a6 + | 7 -> a.a7 + | _ -> invalid_arg "A8.get" + + let set a i x = match i with + | 0 -> {a with a0=x} + | 1 -> {a with a1=x} + | 2 -> {a with a2=x} + | 3 -> {a with a3=x} + | 4 -> {a with a4=x} + | 5 -> {a with a5=x} + | 6 -> {a with a6=x} + | 7 -> {a with a7=x} + | _ -> invalid_arg "A8.set" + + let iter f a = + f a.a0; + f a.a1; + f a.a2; + f a.a3; + f a.a4; + f a.a5; + f a.a6; + f a.a7; + () + + let fold f acc a = + let acc = f acc a.a0 in + let acc = f acc a.a1 in + let acc = f acc a.a2 in + let acc = f acc a.a3 in + let acc = f acc a.a4 in + let acc = f acc a.a5 in + let acc = f acc a.a6 in + let acc = f acc a.a7 in + acc + + let for_all p a = + p a.a0 && + p a.a1 && + p a.a2 && + p a.a3 && + p a.a4 && + p a.a5 && + p a.a6 && + p a.a7 +end + +(** {2 Functors} *) + +module Hash : sig + type t = private int + val make_unsafe : int -> t + val rem : t -> int (* 3 last bits *) + val quotient : t -> t (* remove 3 last bits *) + val combine : t -> int -> t (* add 3 last bits *) +end = struct + type t = int + let make_unsafe i = i + let rem h = h land 7 + let quotient h = h lsr 3 + let combine h r = h lsl 3 lor r + + (* safety checks *) + let () = + assert ( + List.for_all + (fun n -> + let q = quotient n and r = rem n in + n = combine q r + ) [1;2;3;4;10;205;295;4262;1515;67;8;99;224;] + ) +end + +module Make(Key : KEY) +: S with module A = A8 and type key = Key.t += struct + module A = A8 + + let () = assert (A.length = 8) + + let hash_ x = Hash.make_unsafe (Key.hash x) + + type key = Key.t + + (* association list, without duplicates *) + type 'a leaf = + | Nil + | Cons of key * 'a * 'a leaf + + type 'a t = + | E + | L of Hash.t * 'a leaf (* same hash for all elements *) + | N of 'a t A.t + + (* invariants: + L [] --> E + N [E, E,...., E] -> E + *) + + let empty = E + + let is_empty = function + | E -> true + | L (_, Nil) -> assert false + | L _ + | N _ -> false + + let leaf_ k v ~h = L (h, Cons(k,v,Nil)) + + let singleton k v = leaf_ k v ~h:(hash_ k) + + let rec get_exn_list_ k l = match l with + | Nil -> raise Not_found + | Cons (k', v', tail) -> + if Key.equal k k' then v' else get_exn_list_ k tail + + let rec get_exn_ k ~h m = match m with + | E -> raise Not_found + | L (_, l) -> get_exn_list_ k l + | N a -> + let i = Hash.rem h in + let h' = Hash.quotient h in + get_exn_ k ~h:h' (A.get a i) + + let get_exn k m = get_exn_ k ~h:(hash_ k) m + + let get k m = + try Some (get_exn_ k ~h:(hash_ k) m) + with Not_found -> None + + (* TODO: use Hash.combine if array only has one non-empty element *) + + (* [h]: hash, with the part required to reach this leaf removed *) + let rec add_ k v ~h m = match m with + | E -> leaf_ k v ~h + | L (h', l) -> + if h=h' + then L (h, add_list_ k v ~h l) + else (* split into N *) + let a = A.create E in + (* put leaf in the right bucket *) + let i = Hash.rem h' in + let h'' = Hash.quotient h' in + let a = A.set a i (L (h'', l)) in + (* then add new node *) + let a = add_to_array_ k v ~h a in + N a + | N a -> N (add_to_array_ k v ~h a) + + (* [left] list nodes already visited *) + and add_list_ k v ~h l = match l with + | Nil -> Cons (k, v, Nil) + | Cons (k', v', tail) -> + if Key.equal k k' + then Cons (k, v, tail) (* replace *) + else Cons (k', v', add_list_ k v ~h tail) + + (* add k->v to [a] *) + and add_to_array_ k v ~h a = + (* insert in a bucket *) + let i = Hash.rem h in + let h' = Hash.quotient h in + A.set a i (add_ k v ~h:h' (A.get a i)) + + let add k v m = add_ k v ~h:(hash_ k) m + + let is_empty_arr_ a = A.for_all is_empty a + + let rec remove_list_ k l = match l with + | Nil -> Nil + | Cons (k', v', tail) -> + if Key.equal k k' + then tail + else Cons (k', v', remove_list_ k tail) + + let rec remove_rec_ k ~h m = match m with + | E -> E + | L (h, l) -> + begin match remove_list_ k l with + | Nil -> E + | Cons _ as res -> L (h, res) + end + | N a -> + let i = Hash.rem h in + let h' = Hash.quotient h in + let a' = A.set a i (remove_rec_ k ~h:h' (A.get a i)) in + if is_empty_arr_ a' + then E + else N a' + + let remove k m = remove_rec_ k ~h:(hash_ k) m + + let iter f t = + let rec aux = function + | E -> () + | L (_,l) -> aux_list l + | N a -> A.iter aux a + and aux_list = function + | Nil -> () + | Cons (k, v, tl) -> f k v; aux_list tl + in + aux t + + let fold f acc t = + let rec aux acc t = match t with + | E -> acc + | L (_,l) -> aux_list acc l + | N a -> A.fold aux acc a + and aux_list acc l = match l with + | Nil -> acc + | Cons (k, v, tl) -> let acc = f acc k v in aux_list acc tl + in + aux acc t + + let cardinal m = fold (fun n _ _ -> n+1) 0 m + + let to_list m = fold (fun acc k v -> (k,v)::acc) [] m + + let add_list m l = List.fold_left (fun acc (k,v) -> add k v acc) m l + + let of_list l = add_list empty l + + let print ppk ppv out m = + let first = ref true in + iter + (fun k v -> + if !first then first := false else Format.fprintf out ";@ "; + ppk out k; + Format.pp_print_string out " -> "; + ppv out v + ) m + + let rec as_tree m () = match m with + | E -> `Nil + | L (h,l) -> `Node (`L ((h:>int), list_as_tree_ l), []) + | N a -> `Node (`N, array_as_tree_ a) + and list_as_tree_ l = match l with + | Nil -> [] + | Cons (k, v, tail) -> (k,v) :: list_as_tree_ tail + and array_as_tree_ a = A.fold (fun acc t -> as_tree t :: acc) [] a +end diff --git a/src/data/CCHashTrie.mli b/src/data/CCHashTrie.mli new file mode 100644 index 00000000..79dd794e --- /dev/null +++ b/src/data/CCHashTrie.mli @@ -0,0 +1,83 @@ + +(* This file is free software, part of containers. See file "license" for more details. *) + +(** {1 Hash Tries} + + Trie indexed by the hash of the keys, where the branching factor is fixed. + The goal is to have a quite efficient functional structure with fast + update and access {b if} the hash function is good. + The trie is not binary, to improve cache locality and decrease depth. + + {b status: experimental} + + @since NEXT_RELEASE +*) + +type 'a sequence = ('a -> unit) -> unit +type 'a printer = Format.formatter -> 'a -> unit +type 'a ktree = unit -> [`Nil | `Node of 'a * 'a ktree list] + +(** {2 Fixed-Size Arrays} *) +module type FIXED_ARRAY = sig + type 'a t + val create : 'a -> 'a t + val length : int + val get : 'a t -> int -> 'a + val set : 'a t -> int -> 'a -> 'a t + val iter : ('a -> unit) -> 'a t -> unit + val fold : ('b -> 'a -> 'b) -> 'b -> 'a t -> 'b + val for_all : ('a -> bool) -> 'a t -> bool +end + +(** {2 Signature} *) +module type S = sig + module A : FIXED_ARRAY + + type key + + type 'a t + + val empty : 'a t + + val is_empty : _ t -> bool + + val singleton : key -> 'a -> 'a t + + val add : key -> 'a -> 'a t -> 'a t + + val get : key -> 'a t -> 'a option + + val get_exn : key -> 'a t -> 'a + (** @raise Not_found if key not present *) + + val remove : key -> 'a t -> 'a t + + val cardinal : _ t -> int + + val iter : (key -> 'a -> unit) -> 'a t -> unit + + val fold : ('b -> key -> 'a -> 'b) -> 'b -> 'a t -> 'b + + val to_list : 'a t -> (key * 'a) list + + val add_list : 'a t -> (key * 'a) list -> 'a t + + val of_list : (key * 'a) list -> 'a t + + val print : key printer -> 'a printer -> 'a t printer + + val as_tree : 'a t -> [`L of int * (key * 'a) list | `N ] ktree + (** For debugging purpose: explore the structure of the tree, + with [`L (h,l)] being a leaf (with shared hash [h]) + and [`N] an inner node *) +end + +(** {2 Type for keys} *) +module type KEY = sig + type t + val equal : t -> t -> bool + val hash : t -> int +end + +(** {2 Functors} *) +module Make(K : KEY) : S with type key = K.t