From 3d8adbaf0925fd8f90cdd0d827c2ec401bcca105 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Thu, 4 Jun 2015 20:45:40 +0200 Subject: [PATCH] add `CCHashconsedSet` in `containers.data` (set with maximal struct sharing) --- _oasis | 2 +- src/data/CCHashconsedSet.ml | 415 +++++++++++++++++++++++++++++++++++ src/data/CCHashconsedSet.mli | 103 +++++++++ 3 files changed, 519 insertions(+), 1 deletion(-) create mode 100644 src/data/CCHashconsedSet.ml create mode 100644 src/data/CCHashconsedSet.mli diff --git a/_oasis b/_oasis index 05e60867..61093a89 100644 --- a/_oasis +++ b/_oasis @@ -83,7 +83,7 @@ Library "containers_data" Modules: CCMultiMap, CCMultiSet, CCTrie, CCFlatHashtbl, CCCache, CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl, CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray, - CCMixset + CCMixset, CCHashconsedSet BuildDepends: bytes FindlibParent: containers FindlibName: data diff --git a/src/data/CCHashconsedSet.ml b/src/data/CCHashconsedSet.ml new file mode 100644 index 00000000..da55cc65 --- /dev/null +++ b/src/data/CCHashconsedSet.ml @@ -0,0 +1,415 @@ + +(* +copyright (c) 2013-2015, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Hashconsed Sets} *) + + +(* uses "Fast Mergeable Integer Maps", Okasaki & Gill, as a hash tree. +We use big-endian trees. *) + +module type ELT = sig + type t + + val compare : t -> t -> int + (** Total order *) + + val hash : t -> int + (** Deterministic *) +end + +module type S = sig + type elt + + type t + (** Set of elements *) + + val empty : t + + val singleton : elt -> t + + val doubleton : elt -> elt -> t + + val mem : elt -> t -> bool + + val equal : t -> t -> bool + (** Fast equality test [O(1)] *) + + val compare : t -> t -> int + (** Fast (arbitrary) comparisontest [O(1)] *) + + val add : elt -> t -> t + + val remove : elt -> t -> t + + val cardinal : t -> int + + val iter : (elt -> unit) -> t -> unit + (** Iterate on elements, in no particular order *) + + val fold : (elt -> 'a -> 'a) -> t -> 'a -> 'a + (** fold on elements, in arbitrary order *) + + val choose : t -> elt option + + val choose_exn : t -> elt + + val union : t -> t -> t + + val inter : t -> t -> t + + (** {2 Whole-collection operations} *) + + type 'a sequence = ('a -> unit) -> unit + type 'a gen = unit -> 'a option + + val add_list : t -> elt list -> t + + val of_list : elt list -> t + + val to_list : t -> elt list + + val add_seq : t -> elt sequence -> t + + val of_seq : elt sequence -> t + + val to_seq : t -> elt sequence +end + +module Make(E : ELT) : S with type elt = E.t = struct + type elt = E.t + + type t = { + cell: cell; + id: int; (* unique hashconsing ID *) + } + and cell = + | E (* empty *) + | L of int * elt list (* leaf: sorted list of elements *) + | N of int (* common prefix *) * int (* bit switch *) * t * t + + let rec eq_list_ l1 l2 = match l1, l2 with + | [], [] -> true + | [], _ + | _, [] -> false + | x1 :: tl1, x2 :: tl2 -> + E.compare x1 x2 = 0 && eq_list_ tl1 tl2 + + let hash_pair_ a b = Hashtbl.hash (a,b) + let hash_quad_ a b c d = Hashtbl.hash (a,b,c,d) + + let rec hash_list_ l = match l with + | [] -> 0xf00d + | x :: tl -> hash_pair_ x (hash_list_ tl) + + (* hashconsing table *) + module Tbl = Weak.Make(struct + type t_ = t + type t = t_ + let equal t1 t2 = match t1.cell, t2.cell with + | E, E -> true + | L (k1, l1), L (k2, l2) -> k1==k2 && eq_list_ l1 l2 + | N (a1, b1, l1, r1), N (a2, b2, l2, r2) -> + a1==a2 && b1==b2 && l1.id == l2.id && r1.id == r2.id + | E, _ + | L _, _ + | N _, _ -> false + let hash t = match t.cell with + | E -> 42 + | L (k, l) -> hash_pair_ k (hash_list_ l) + | N (a, b, l, r) -> + hash_quad_ a b l.id r.id + end) + + let table_ = Tbl.create 4096 + let id_ = ref 1 + + (* make a node out of a cell, with hashconsing *) + let hashcons_ cell = + let n = {cell; id= !id_} in + let n' = Tbl.merge table_ n in + if n==n' then incr id_; + n' + + (* empty tree *) + let empty = hashcons_ E + + let bit_is_0_ x ~bit = x land bit = 0 + + let mask_ x ~mask = (x lor (mask -1)) land (lnot mask) + (* low endian: let mask_ x ~mask = x land (mask - 1) *) + + let is_prefix_ ~prefix y ~bit = prefix = mask_ y ~mask:bit + + (* loop down until x=lowest_bit_ x *) + let rec highest_bit_naive x m = + if m = 0 then 0 + else if x land m = 0 then highest_bit_naive x (m lsr 1) + else m + + let highest_bit_ = + (* the highest representable 2^n *) + let max_log = 1 lsl (Sys.word_size - 2) in + fun x -> + if x > 1 lsl 20 + then (* small shortcut: remove least significant 20 bits *) + let x' = x land (lnot ((1 lsl 20) -1)) in + highest_bit_naive x' max_log + else highest_bit_naive x max_log + + let branching_bit_ a b = highest_bit_ (a lxor b) + + let rec list_mem_ x l = match l with + | [] -> false + | y :: tl -> + match E.compare x y with + | 0 -> true + | c when c > 0 -> list_mem_ x tl + | _ -> false (* [x] cannot be in the tail, all elements are larger *) + + let rec mem_rec_ k x t = match t.cell with + | E -> false + | L (k', l) when k = k' -> + list_mem_ x l + | L _ -> false + | N (prefix, m, l, r) -> + if is_prefix_ ~prefix k ~bit:m + then if bit_is_0_ k ~bit:m + then mem_rec_ k x l + else mem_rec_ k x r + else raise Not_found + + let equal t1 t2 = t1.id = t2.id + + let compare t1 t2 = Pervasives.compare t1.id t2.id + + let mem x t = mem_rec_ (E.hash x) x t + + let mk_node_ prefix switch l r = match l.cell, r.cell with + | E, _ -> r + | _, E -> l + | _ -> hashcons_ (N (prefix, switch, l, r)) + + let mk_leaf_ hash l = match l with + | [] -> empty + | _::_ -> hashcons_ (L (hash, l)) + + (* join trees t1 and t2 with prefix p1 and p2 respectively + (p1 and p2 do not overlap) *) + let join_ t1 p1 t2 p2 = + let switch = branching_bit_ p1 p2 in + let prefix = mask_ p1 ~mask:switch in + if bit_is_0_ p1 ~bit:switch + then mk_node_ prefix switch t1 t2 + else (assert (bit_is_0_ p2 ~bit:switch); mk_node_ prefix switch t2 t1) + + let singleton_ k x = hashcons_ (L (k, [x])) + + let singleton x = singleton_ (E.hash x) x + + (* insert [x] in [l], keeping [l] sorted *) + let rec insert_list_ x l = match l with + | [] -> [x] + | y :: tl -> + match E.compare x y with + | 0 -> l (* already in there *) + | c when c<0 -> + (* x y :: insert_list_ x tl + + let rec add_rec_ k x t = match t.cell with + | E -> hashcons_ (L (k, [x])) + | L (k', l) -> + if k=k' + then hashcons_ (L (k, insert_list_ x l)) + else join_ t k' (singleton_ k x) k + | N (prefix, switch, l, r) -> + if is_prefix_ ~prefix k ~bit:switch + then if bit_is_0_ k ~bit:switch + then hashcons_ (N(prefix, switch, add_rec_ k x l, r)) + else hashcons_ (N(prefix, switch, l, add_rec_ k x r)) + else join_ (singleton_ k x) k t prefix + + let add x t = add_rec_ (E.hash x) x t + + (*$Q & ~count:20 + Q.(list int) (fun l -> \ + let module S = Make(CCInt) in \ + let m = S.of_list l in \ + List.for_all (fun x -> S.mem x m) l) + *) + + let rec remove_list_ x l = match l with + | [] -> [] + | y :: tl -> + match E.compare x y with + | 0 -> tl (* eliminate *) + | c when c<0 -> l (* cannot be in [l] *) + | _ -> y :: remove_list_ x tl + + let rec remove_rec_ k x t = match t.cell with + | E -> empty + | L (k', l) when k=k' -> + mk_leaf_ k (remove_list_ x l) + | L _ -> t (* preserve *) + | N (prefix, switch, l, r) -> + if is_prefix_ ~prefix k ~bit:switch + then if bit_is_0_ k ~bit:switch + then mk_node_ prefix switch (remove_rec_ k x l) r + else mk_node_ prefix switch l (remove_rec_ k x r) + else t (* not present *) + + let remove x l = remove_rec_ (E.hash x) x l + + let doubleton v1 v2 = add v1 (singleton v2) + + let rec iter f t = match t.cell with + | E -> () + | L (_, v) -> List.iter f v + | N (_, _, l, r) -> iter f l; iter f r + + let rec fold f t acc = match t.cell with + | E -> acc + | L (_, l) -> List.fold_right f l acc + | N (_, _, l, r) -> + let acc = fold f l acc in + fold f r acc + + let cardinal t = fold (fun _ n -> n+1) t 0 + + let rec choose_exn t = match t.cell with + | E -> raise Not_found + | L (_, []) -> assert false + | L (_, x :: _) -> x + | N (_, _, l, _) -> choose_exn l + + let choose t = + try Some (choose_exn t) + with Not_found -> None + + let rec union_list_ l1 l2 = match l1, l2 with + | [], _ -> l2 + | _, [] -> l1 + | x1 :: tl1, x2 :: tl2 -> + match E.compare x1 x2 with + | 0 -> x1 :: union_list_ tl1 tl2 + | c when c<0 -> x1 :: union_list_ tl1 l2 + | _ -> x2 :: union_list_ l1 tl2 + + (* add elements of [l], all of which have hash [k], to [t] *) + let add_list_hash_ k l t = + List.fold_left + (fun t x -> add_rec_ k x t) + t l + + let rec union a b = match a.cell, b.cell with + | E, _ -> b + | _, E -> a + | L (k1, l1), L(k2, l2) when k1==k2 -> + mk_leaf_ k1 (union_list_ l1 l2) (* merge leaves *) + | L (k, l), _ -> add_list_hash_ k l b + | _, L (k, l) -> add_list_hash_ k l a + | N (p1, m1, l1, r1), N (p2, m2, l2, r2) -> + if p1 = p2 && m1 = m2 + then mk_node_ p1 m1 (union l1 l2) (union r1 r2) + else if m1 < m2 && is_prefix_ ~prefix:p2 p1 ~bit:m1 + then if bit_is_0_ p2 ~bit:m1 + then hashcons_ (N (p1, m1, union l1 b, r1)) + else hashcons_ (N (p1, m1, l1, union r1 b)) + else if m1 > m2 && is_prefix_ ~prefix:p1 p2 ~bit:m2 + then if bit_is_0_ p1 ~bit:m2 + then hashcons_ (N (p2, m2, union l2 a, r2)) + else hashcons_ (N (p2, m2, l2, union r2 a)) + else join_ a p1 b p2 + + (*$Q + Q.(list int) (fun l -> \ + let module S = Make(CCInt) in \ + let s = S.of_list l in S.equal s (S.union s s)) + *) + + let rec inter_list_ l1 l2 = match l1, l2 with + | [], _ + | _, [] -> [] + | x1 :: tl1, x2 :: tl2 -> + match E.compare x1 x2 with + | 0 -> x1 :: inter_list_ tl1 tl2 + | c when c<0 -> inter_list_ tl1 l2 + | _ -> inter_list_ l1 tl2 + + let rec inter a b = match a.cell, b.cell with + | E, _ | _, E -> empty + | L (k1, l1), L (k2, l2) when k1==k2 -> + mk_leaf_ k1 (inter_list_ l1 l2) + | L _, _ + | _, L _ -> empty + | N (p1, m1, l1, r1), N (p2, m2, l2, r2) -> + if p1 = p2 && m1 = m2 + then mk_node_ p1 m1 (inter l1 l2) (inter r1 r2) + else if m1 < m2 && is_prefix_ ~prefix:p2 p1 ~bit:m1 + then if bit_is_0_ p2 ~bit:m1 + then inter l1 b + else inter r1 b + else if m1 > m2 && is_prefix_ ~prefix:p1 p2 ~bit:m2 + then if bit_is_0_ p1 ~bit:m2 + then inter l2 a + else inter r2 a + else empty + + (*$Q + Q.(list int) (fun l -> \ + let module S = Make(CCInt) in \ + let s = S.of_list l in S.equal s (S.inter s s)) + *) + + (* TODO: difference *) + + (** {2 Whole-collection operations} *) + + type 'a sequence = ('a -> unit) -> unit + type 'a gen = unit -> 'a option + + let add_list t l = List.fold_left (fun t x -> add x t) t l + + let of_list l = add_list empty l + + let to_list t = fold (fun x l -> x:: l) t [] + + (*$Q + Q.(list int) (fun l -> \ + let module S = Make(CCInt) in \ + S.of_list l |> S.cardinal = List.length l) + *) + + let add_seq t seq = + let t = ref t in + seq (fun x -> t := add x !t); + !t + + let of_seq seq = add_seq empty seq + + let to_seq t yield = iter yield t +end diff --git a/src/data/CCHashconsedSet.mli b/src/data/CCHashconsedSet.mli new file mode 100644 index 00000000..7f824ea0 --- /dev/null +++ b/src/data/CCHashconsedSet.mli @@ -0,0 +1,103 @@ + +(* +copyright (c) 2013-2015, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Hashconsed Sets} + + Sets are hashconsed, so that set equality is physical equality. Some + sub-structure that is common to several sets is also perfectly shared. + +{b status: unstable} +@since NEXT_RELEASE *) + +module type ELT = sig + type t + + val compare : t -> t -> int + (** Total order *) + + val hash : t -> int + (** Deterministic *) +end + +module type S = sig + type elt + + type t + (** Set of elements *) + + val empty : t + + val singleton : elt -> t + + val doubleton : elt -> elt -> t + + val mem : elt -> t -> bool + + val equal : t -> t -> bool + (** Fast equality test [O(1)] *) + + val compare : t -> t -> int + (** Fast (arbitrary) comparisontest [O(1)] *) + + val add : elt -> t -> t + + val remove : elt -> t -> t + + val cardinal : t -> int + + val iter : (elt -> unit) -> t -> unit + (** Iterate on elements, in no particular order *) + + val fold : (elt -> 'a -> 'a) -> t -> 'a -> 'a + (** fold on elements, in arbitrary order *) + + val choose : t -> elt option + + val choose_exn : t -> elt + + val union : t -> t -> t + + val inter : t -> t -> t + + (** {2 Whole-collection operations} *) + + type 'a sequence = ('a -> unit) -> unit + type 'a gen = unit -> 'a option + + val add_list : t -> elt list -> t + + val of_list : elt list -> t + + val to_list : t -> elt list + + val add_seq : t -> elt sequence -> t + + val of_seq : elt sequence -> t + + val to_seq : t -> elt sequence +end + +module Make(E : ELT) : S with type elt = E.t