diff --git a/_oasis b/_oasis index 1f182e98..c1421116 100644 --- a/_oasis +++ b/_oasis @@ -41,7 +41,7 @@ Library "containers" Modules: CCVector, CCDeque, CCGen, CCSequence, CCFQueue, CCMultiMap, CCMultiSet, CCBV, CCPrint, CCPersistentHashtbl, CCLeftistheap, CCList, CCOpt, CCPair, CCFun, CCHash, - CCKList, CCInt, CCBool, CCArray, CCBatch + CCKList, CCInt, CCBool, CCArray, CCBatch, CCLinq FindlibName: containers Library "containers_string" @@ -60,7 +60,7 @@ Library "containers_misc" Bij, PiCalculus, Bencode, Sexp, RAL, UnionFind, SmallSet, AbsSet, CSM, ActionMan, QCheck, BencodeOnDisk, TTree, - HGraph, Automaton, Conv, Bidir, Iteratee, Linq, + HGraph, Automaton, Conv, Bidir, Iteratee, Ty, Tell, BencodeStream, RatTerm, Cause, AVL, ParseReact BuildDepends: unix,containers FindlibName: misc diff --git a/core/CCLinq.ml b/core/CCLinq.ml index a2cda146..9728d414 100644 --- a/core/CCLinq.ml +++ b/core/CCLinq.ml @@ -30,39 +30,61 @@ type 'a sequence = ('a -> unit) -> unit type 'a equal = 'a -> 'a -> bool type 'a ord = 'a -> 'a -> int type 'a hash = 'a -> int -type 'a klist = unit -> [ `Nil | `Cons of 'a * 'a klist ] let _id x = x +type 'a search_result = + | SearchContinue + | SearchStop of 'a + module Coll = struct type 'a t = - | Seq of 'a sequence - | List of 'a list + | Seq : 'a sequence -> 'a t + | List : 'a list -> 'a t + | Set : (module CCSequence.Set.S + with type elt = 'a and type t = 'b) * 'b -> 'a t let of_seq s = Seq s let of_list l = List l let of_array a = Seq (CCSequence.of_array a) - let empty = List [] - let to_seq = function + let set_of_seq (type elt) ?(cmp=Pervasives.compare) seq = + let module S = CCSequence.Set.Make(struct + type t = elt + let compare = cmp + end) in + let set = S.of_seq seq in + Set ((module S), set) + + let to_seq (type elt) = function | Seq s -> s | List l -> (fun k -> List.iter k l) + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + S.to_seq set - let to_list = function + let to_list (type elt) = function | Seq s -> CCSequence.to_list s | List l -> l + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + S.elements set let _fmap ~lst ~seq c = match c with | List l -> List (lst l) | Seq s -> Seq (seq s) + | Set _ -> + List (lst (to_list c)) - let _fold ~lst ~seq acc c = match c with - | List l -> List.fold_left lst acc l - | Seq s -> CCSequence.fold seq acc s - - let iter f c = match c with - | List l -> List.iter f l - | Seq s -> s f + let fold (type elt) f acc c = match c with + | List l -> List.fold_left f acc l + | Seq s -> CCSequence.fold f acc s + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + S.fold (fun x acc -> f acc x) set acc let map f c = _fmap ~lst:(List.map f) ~seq:(CCSequence.map f) c @@ -72,16 +94,79 @@ module Coll = struct let flat_map f c = let c' = to_seq c in - Seq (CCSequence.flatMap f c') + Seq (CCSequence.flatMap (fun x -> to_seq (f x)) c') let filter_map f c = _fmap ~lst:(CCList.filter_map f) ~seq:(CCSequence.fmap f) c - let size = function + let size (type elt) = function | List l -> List.length l | Seq s -> CCSequence.length s + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + S.cardinal set - let fold f acc c = _fold ~lst:f ~seq:f acc c + let choose (type elt) = function + | List [] -> None + | List (x::_) -> Some x + | Seq s -> + begin match CCSequence.take 1 s |> CCSequence.to_list with + | [x] -> Some x + | _ -> None + end + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + try Some (S.choose set) with Not_found -> None + + let take n c = + _fmap ~lst:(CCList.take n) ~seq:(CCSequence.take n) c + + exception MySurpriseExit + + let _seq_take_while p seq k = + try + seq (fun x -> if not (p x) then k x else raise MySurpriseExit) + with MySurpriseExit -> () + + let take_while p c = + to_seq c |> _seq_take_while p |> of_seq + + let distinct ~cmp c = set_of_seq ~cmp (to_seq c) + + let sort cmp c = match c with + | List l -> List (List.sort cmp l) + | _ -> + to_seq c |> set_of_seq ~cmp + + let search obj c = + let _search_seq obj seq = + let ret = ref None in + begin try + seq (fun x -> match obj#check x with + | SearchContinue -> () + | SearchStop y -> ret := Some y; raise MySurpriseExit); + with MySurpriseExit -> () + end; + match !ret with + | None -> obj#failure + | Some x -> x + in + to_seq c |> _search_seq obj + + let contains (type elt) ~eq x c = match c with + | List l -> List.exists (eq x) l + | Seq s -> CCSequence.exists (eq x) s + | Set (m, set) -> + let module S = (val m : CCSequence.Set.S + with type elt = elt and type t = 'b) in + (* XXX: here we don't use the equality relation *) + try + let y = S.find x set in + assert (eq x y); + true + with Not_found -> false end type 'a collection = 'a Coll.t @@ -95,7 +180,14 @@ module Map = struct to_seq : ('a * 'b) sequence; } - let make_hash (type key) ?(eq=(=)) ?(hash=Hashtbl.hash) seq = + type ('a, 'b) build = { + mutable cur : ('a, 'b) t; + add : 'a -> 'b -> unit; + update : 'a -> ('b option -> 'b option) -> unit; + } + + (* careful to use this map linearly *) + let make_hash (type key) ?(eq=(=)) ?(hash=Hashtbl.hash) () = let module H = Hashtbl.Make(struct type t = key let equal = eq @@ -103,57 +195,99 @@ module Map = struct end) in (* build table *) let tbl = H.create 32 in - seq - (fun (k,v) -> - let l = try H.find tbl k with Not_found -> [] in - H.replace tbl k (v::l) - ); - (* provide the multimap interface *) - let to_seq cont = H.iter (fun k v -> cont (k, Coll.of_list v)) tbl - in - { + let cur = { is_empty = (fun () -> H.length tbl = 0); size = (fun () -> H.length tbl); get = (fun k -> - try Some (Coll.of_list (H.find tbl k)) + try Some (H.find tbl k) with Not_found -> None); - fold = (fun f acc -> H.fold (fun k v acc -> f acc k (Coll.of_list v)) tbl acc); - to_seq; + fold = (fun f acc -> H.fold (fun k v acc -> f acc k v) tbl acc); + to_seq = (fun k -> H.iter (fun key v -> k (key,v)) tbl); + } in + { cur; + add = (fun k v -> H.replace tbl k v); + update = (fun k f -> + match (try f (Some (H.find tbl k)) with Not_found -> f None) with + | None -> H.remove tbl k + | Some v' -> H.replace tbl k v'); } - let make_map (type key) (type value) - ?(cmp_key=Pervasives.compare) ?(cmp_val=Pervasives.compare) seq = + let make_cmp (type key) ?(cmp=Pervasives.compare) () = let module M = CCSequence.Map.Make(struct type t = key - let compare = cmp_key + let compare = cmp end) in - let module S = CCSequence.Set.Make(struct - type t = value - let compare = cmp_val - end) in - let _map_set set = Coll.of_seq (S.to_seq set) in - let map = CCSequence.fold - (fun map (k,v) -> - let set = try M.find k map with Not_found -> S.empty in - M.add k (S.add v set) map - ) M.empty seq - in - let to_seq = - M.to_seq map |> CCSequence.map (fun (k,v) -> k, _map_set v) - in - { - is_empty = (fun () -> M.is_empty map); - size = (fun () -> M.cardinal map); + let map = ref M.empty in + let cur = { + is_empty = (fun () -> M.is_empty !map); + size = (fun () -> M.cardinal !map); get = (fun k -> - try Some (_map_set (M.find k map)) + try Some (M.find k !map) with Not_found -> None); fold = (fun f acc -> M.fold - (fun key set acc -> f acc key (_map_set set)) map acc + (fun key set acc -> f acc key set) !map acc ); - to_seq; + to_seq = M.to_seq !map; + } in + { + cur; + add = (fun k v -> map := M.add k v !map); + update = (fun k f -> + match (try f (Some (M.find k !map)) with Not_found -> f None) with + | None -> map := M.remove k !map + | Some v' -> map := M.add k v' !map); } + type 'a key_info = { + eq : 'a equal option; + cmp : 'a ord option; + hash : 'a hash option; + } + + let default_key_info = { + eq=None; cmp=None; hash=None; + } + + let make_info info = + match info with + | { hash=None; _} + | { eq=None; _} -> + begin match info.cmp with + | None -> make_cmp () + | Some cmp -> make_cmp ~cmp () + end + | {eq=Some eq; hash=Some hash; _} -> make_hash ~eq ~hash () + + let multiset build seq = + seq (fun (k,v) -> + build.update k (function + | None -> Some [v] + | Some l -> Some (v::l))); + { is_empty = build.cur.is_empty; + size = build.cur.size; + get = (fun k -> match build.cur.get k with + | None -> None + | Some v -> Some (Coll.of_list v)); + fold = (fun f acc -> + build.cur.fold (fun acc k v -> f acc k (Coll.of_list v)) acc); + to_seq = build.cur.to_seq + |> CCSequence.map (fun (k,v) -> k,Coll.of_list v); + } + + let multimap_cmp ?cmp seq = + let build = make_cmp ?cmp () in + multiset build seq + + let count build seq = + seq (fun x -> + let n = match build.cur.get x with + | None -> 1 + | Some n -> n+1 + in + build.add x n); + build.cur + let get m x = m.get x let get_exn m x = @@ -164,26 +298,14 @@ module Map = struct let size m = m.size () let to_seq m = m.to_seq - - type 'a key_info = { - eq : 'a equal option; - cmp : 'a ord option; - hash : 'a hash option; - } end (** {2 Query operators} *) -type safe = Safe -type unsafe = Unsafe type (_,_) safety = | Safe : ('a, 'a option) safety | Unsafe : ('a, 'a) safety -type 'a search_result = - | SearchContinue - | SearchStop of 'a - type (_, _) unary = | Map : ('a -> 'b) -> ('a collection, 'b collection) unary | GeneralMap : ('a -> 'b) -> ('a, 'b) unary @@ -198,21 +320,21 @@ type (_, _) unary = | Take : int -> ('a collection, 'a collection) unary | TakeWhile : ('a -> bool) -> ('a collection, 'a collection) unary | Sort : 'a ord -> ('a collection, 'a collection) unary - | Distinct : 'a ord option * 'a equal option * 'a hash option - -> ('a collection, 'a collection) unary + | Distinct : 'a ord -> ('a collection, 'a collection) unary | Search : < check: ('a -> 'b search_result); failure : 'b; > -> ('a collection, 'b) unary + | Contains : 'a equal * 'a -> ('a collection, bool) unary | Get : ('b,'c) safety * 'a -> (('a,'b) Map.t, 'c) unary - | GroupBy : 'b ord * 'a ord * ('a -> 'b) + | GroupBy : 'b ord * ('a -> 'b) -> ('a collection, ('b,'a collection) Map.t) unary | Count : 'a ord -> ('a collection, ('a, int) Map.t) unary type ('a,'b,'key,'c) join_descr = { join_key1 : 'a -> 'key; join_key2 : 'b -> 'key; - join_merge : 'key -> 'a -> 'b -> 'c; + join_merge : 'key -> 'a -> 'b -> 'c option; join_key : 'key Map.key_info; } @@ -234,7 +356,6 @@ type (_, _, _) binary = | Product : ('a collection, 'b collection, ('a*'b) collection) binary | Append : ('a collection, 'a collection, 'a collection) binary | SetOp : set_op * 'a ord -> ('a collection, 'a collection, 'a collection) binary - | Inter : 'a ord -> ('a collection, 'a collection, 'a collection) binary (* type of queries that return a 'a *) and 'a t = @@ -258,12 +379,6 @@ let start_hashtbl h = let start_seq seq = Start (Coll.of_seq seq) -(** {6 Composition} *) - -let apply u q = Unary (u, q) - -let (>>>) = apply - (** {6 Execution} *) let rec _optimize : type a. a t -> a t @@ -274,26 +389,98 @@ let rec _optimize : type a. a t -> a t | Binary (b, q1, q2) -> _optimize_binary b (_optimize q1) (_optimize q2) | QueryMap (f, q) -> QueryMap (f, _optimize q) + | Bind _ -> q (* cannot optimize before execution *) and _optimize_unary : type a b. (a,b) unary -> a t -> b t = fun u q -> match u, q with | Map f, Unary (Map g, q') -> _optimize_unary (Map (fun x -> f (g x))) q' + | Filter p, Unary (Map f, cont) -> + _optimize_unary + (FilterMap (fun x -> let y = f x in if p y then Some y else None)) + cont + | Map f, Unary (Filter p, cont) -> + _optimize_unary + (FilterMap (fun x -> if p x then Some (f x) else None)) + cont + | Map f, Binary (Append, q1, q2) -> + _optimize_binary Append (Unary (u, q1)) (Unary (u, q2)) + | Filter p, Binary (Append, q1, q2) -> + _optimize_binary Append (Unary (u, q1)) (Unary (u, q2)) + | Fold (f,acc), Unary (Map f', cont) -> + _optimize_unary + (Fold ((fun acc x -> f acc (f' x)), acc)) + cont + | Reduce (safety, start, mix, stop), Unary (Map f, cont) -> + _optimize_unary + (Reduce (safety, + (fun x -> start (f x)), + (fun x acc -> mix (f x) acc), + stop)) + cont + | Size, Unary (Map _, cont) -> + _optimize_unary Size cont (* ignore the map! *) + | Size, Unary (Sort _, cont) -> + _optimize_unary Size cont | _ -> Unary (u,q) - (* TODO *) + (* TODO: other cases *) and _optimize_binary : type a b c. (a,b,c) binary -> a t -> b t -> c t = fun b q1 q2 -> match b, q1, q2 with - | _ -> Binary (b, q1, q2) (* TODO *) - + | _ -> Binary (b, q1, q2) (* TODO *) (* apply a unary operator on a collection *) let _do_unary : type a b. (a,b) unary -> a -> b = fun u c -> match u with | Map f -> Coll.map f c + | GeneralMap f -> f c | Filter p -> Coll.filter p c - | Fold (f, acc) -> Coll.fold f acc c (* TODO: optimize *) + | Fold (f, acc) -> Coll.fold f acc c + | Reduce (safety, start, mix, stop) -> + let acc = Coll.to_seq c + |> CCSequence.fold + (fun acc x -> match acc with + | None -> Some (start x) + | Some acc -> Some (mix x acc) + ) None + in + begin match acc, safety with + | Some x, Safe -> Some (stop x) + | None, Safe -> None + | Some x, Unsafe -> stop x + | None, Unsafe -> invalid_arg "reduce: empty collection" + end + | Size -> Coll.size c + | Choose Safe -> Coll.choose c + | Choose Unsafe -> + begin match Coll.choose c with + | Some x -> x + | None -> invalid_arg "choose: empty collection" + end + | FilterMap f -> Coll.filter_map f c + | FlatMap f -> Coll.flat_map f c + | Take n -> Coll.take n c + | TakeWhile p -> Coll.take_while p c + | Sort cmp -> Coll.sort cmp c + | Distinct cmp -> Coll.distinct ~cmp c + | Search obj -> Coll.search obj c + | Get (Safe, k) -> Map.get c k + | Get (Unsafe, k) -> Map.get_exn c k + | GroupBy (cmp,f) -> + Coll.to_seq c + |> CCSequence.map (fun x -> f x, x) + |> Map.multimap_cmp ~cmp + | Contains (eq, x) -> Coll.contains ~eq x c + | Count cmp -> + Coll.to_seq c + |> Map.count (Map.make_cmp ~cmp ()) + (* TODO: join of two collections *) let _do_join ~join c1 c2 = + let _build = Map.make_info join.join_key in + assert false + +(* TODO *) +let _do_group_join ~gjoin c1 c2 = assert false let _do_product c1 c2 = @@ -303,7 +490,20 @@ let _do_product c1 c2 = let _do_binary : type a b c. (a, b, c) binary -> a -> b -> c = fun b c1 c2 -> match b with | Join join -> _do_join ~join c1 c2 + | GroupJoin gjoin -> _do_group_join ~gjoin c1 c2 | Product -> _do_product c1 c2 + | Append -> + Coll.of_seq (CCSequence.append (Coll.to_seq c1) (Coll.to_seq c2)) + | SetOp (Inter,cmp) -> + (* use a join *) + _do_join ~join:{ + join_key1=_id; + join_key2=_id; + join_merge=(fun _ a b -> Some a); + join_key=Map.({default_key_info with cmp=Some cmp; }) + } c1 c2 + | SetOp (Union,cmp) -> failwith "union: not implemented" (* TODO *) + | SetOp (Diff,cmp) -> failwith "diff: not implemented" (* TODO *) let rec _run : type a. opt:bool -> a t -> a = fun ~opt q -> match q with @@ -326,8 +526,6 @@ let map f q = Unary (Map f, q) let filter p q = Unary (Filter p, q) -let size q = Unary (Size, q) - let choose q = Unary (Choose Safe, q) let choose_exn q = Unary (Choose Unsafe, q) @@ -346,8 +544,8 @@ let take_while p q = Unary (TakeWhile p, q) let sort ~cmp q = Unary (Sort cmp, q) -let distinct ?cmp ?eq ?hash () q = - Unary (Distinct (cmp,eq,hash), q) +let distinct ?(cmp=Pervasives.compare) () q = + Unary (Distinct cmp, q) let get key q = Unary (Get (Safe, key), q) @@ -366,8 +564,8 @@ let map_to_seq_flatten q = in Unary (GeneralMap f, q) -let group_by ?(cmp_key=Pervasives.compare) ?(cmp_val=Pervasives.compare) f q = - Unary (GroupBy (cmp_key,cmp_val,f), q) +let group_by ?(cmp=Pervasives.compare) f q = + Unary (GroupBy (cmp,f), q) let count ?(cmp=Pervasives.compare) () q = Unary (Count cmp, q) @@ -401,6 +599,15 @@ let max_exn q = Unary (Reduce (Unsafe, _id, Pervasives.max, _id), q) let min_exn q = Unary (Reduce (Unsafe, _id, Pervasives.min, _id), q) let average_exn q = Unary (Reduce (Unsafe, _avg_start, _avg_mix, _avg_stop), q) +let is_empty q = + Unary (Search (object + method check _ = SearchStop false (* stop in case there is an element *) + method failure = true + end), q) + +let contains ?(eq=(=)) x q = + Unary (Contains (eq, x), q) + let for_all p q = Unary (Search (object method check x = if p x then SearchContinue else SearchStop false diff --git a/core/CCLinq.mli b/core/CCLinq.mli index b9915cc3..885c8840 100644 --- a/core/CCLinq.mli +++ b/core/CCLinq.mli @@ -113,8 +113,7 @@ val take_while : ('a -> bool) -> 'a collection t -> 'a collection t val sort : cmp:'a ord -> 'a collection t -> 'a collection t (** Sort items by the given comparison function *) -val distinct : ?cmp:'a ord -> ?eq:'a equal -> ?hash:'a hash -> - unit -> 'a collection t -> 'a collection t +val distinct : ?cmp:'a ord -> unit -> 'a collection t -> 'a collection t (** Remove duplicate elements from the input collection. All elements in the result are distinct. *) @@ -135,7 +134,7 @@ val map_to_seq_flatten : ('a,'b collection) Map.t t -> ('a*'b) collection t (** {6 Aggregation} *) -val group_by : ?cmp_key:'b ord -> ?cmp_val:'a ord -> +val group_by : ?cmp:'b ord -> ('a -> 'b) -> 'a collection t -> ('b,'a collection) Map.t t (** [group_by f] takes a collection [c] as input, and returns a multimap [m] such that for each [x] in [c], @@ -163,8 +162,12 @@ val reduce_exn : ('a -> 'b) -> ('a -> 'b -> 'b) -> ('b -> 'c) -> (** Same as {!reduce} but fails on empty collections. @raise Invalid_argument if the collection is empty *) +val is_empty : 'a collection t -> bool t + val sum : int collection t -> int t +val contains : ?eq:'a equal -> 'a -> 'a collection t -> bool t + val average : int collection t -> int option t val max : int collection t -> int option t val min : int collection t -> int option t @@ -182,13 +185,14 @@ val find_map : ('a -> 'b option) -> 'a collection t -> 'b option t val join : ?cmp:'key ord -> ?eq:'key equal -> ?hash:'key hash -> ('a -> 'key) -> ('b -> 'key) -> - merge:('key -> 'a -> 'b -> 'c) -> + merge:('key -> 'a -> 'b -> 'c option) -> 'a collection t -> 'b collection t -> 'c collection t (** [join key1 key2 ~merge] is a binary operation that takes two collections [a] and [b], projects their elements resp. with [key1] and [key2], and combine values [(x,y)] from [(a,b)] with the same [key] - using [merge]. *) + using [merge]. If [merge] returns [None], the combination + of values is discarded. *) val group_join : ?cmp:'a ord -> ?eq:'a equal -> ?hash:'a hash -> ('b -> 'a) -> 'a collection t -> 'b collection t ->