diff --git a/VPTree.ml b/VPTree.ml new file mode 100644 index 00000000..b0dd7e29 --- /dev/null +++ b/VPTree.ml @@ -0,0 +1,223 @@ +(* +copyright (c) 2013, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Vantage-Point Tree} *) + +module type METRIC_SPACE = sig + type t + (** Elements of the metric space *) + + type num + (** Numeric type used for distances *) + + val distance : t -> t -> num + (** Distance between two points. It must satisfy the following invariants: + + - [distance x y = distance y x] + - [distance x y <= distance x z + distance z y] + - [distance x y >= 0] + - [distance x y = 0] if and only if [x] and [y] are the same point. + *) + + val add : num -> num -> num + (** Addition of two distances (associative, commutative) *) + + val compare : num -> num -> int + (** Total ordering on the numeric type used for distances *) +end +(** {2 Utils} *) + +type 'a sequence = ('a -> unit) -> unit + +module LazyList = struct + type 'a t = + | Nil + | Cons of 'a * 'a t Lazy.t + + let rec take n l = match l with + | _ when n = 0 -> Nil + | Nil -> Nil + | Cons (x, (lazy l')) -> Cons (x, lazy (take (n-1) l')) + + let rec iter l k = match l with + | Nil -> () + | Cons (x, (lazy l')) -> k x; iter l' k + + let rec to_list l = match l with + | Nil -> [] + | Cons (x, (lazy l')) -> x :: (to_list l') +end + +(** {3 Mutable heap} *) +module Heap = struct + (** Implementation from http://en.wikipedia.org/wiki/Skew_heap *) + + type 'a t = { + mutable tree : 'a tree; + cmp : 'a -> 'a -> int; + } (** A pairing tree heap with the given comparison function *) + and 'a tree = + | Empty + | Node of 'a * 'a tree * 'a tree + + let empty ~cmp = { + tree = Empty; + cmp; + } + + let is_empty h = + match h.tree with + | Empty -> true + | Node _ -> false + + let rec union ~cmp t1 t2 = match t1, t2 with + | Empty, _ -> t2 + | _, Empty -> t1 + | Node (x1, l1, r1), Node (x2, l2, r2) -> + if cmp x1 x2 <= 0 + then Node (x1, union ~cmp t2 r1, l1) + else Node (x2, union ~cmp t1 r2, l2) + + let insert h x = + h.tree <- union ~cmp:h.cmp (Node (x, Empty, Empty)) h.tree + + let pop h = match h.tree with + | Empty -> raise Not_found + | Node (x, l, r) -> + h.tree <- union ~cmp:h.cmp l r; + x +end + +(** {2 VPTree functorial interface} *) + +module type S = sig + module Space : METRIC_SPACE + + type key = Space.t + + type +'a t + (** VPTree that maps to data of type 'a *) + + val member : _ t -> key -> bool + (** Is the given key, member of the tree? *) + + val find_closest : 'a t -> key -> (key * 'a * int) LazyList.t + (** [find_closest tree key] finds the points of [tree] + that are the closest to [key], in increasing order. Enumerates all + the points eventually, if the whole list is explored. + Each tuple also contains the distance to the [key]. *) + + val of_array : (key * 'a) array -> 'a t + (** Build a VPTree from an array of bindings. The array is modified in + place (values are swapped). + If a key occurs several times in the sequence, one of the value + it maps to is chosen arbitrarily. *) + + val of_list : (key * 'a) list -> 'a t + (** Build a VPTree from a list *) + + val of_seq : (key * 'a) sequence -> 'a t + + val of_lazy_list : (key * 'a) LazyList.t -> 'a t + + val size : _ t -> int + (** Find the number of key/value pairs in the tree (linear time) *) +end + +(* we follow François Berenger's implementation, but simpler, because we + want to retrieve several points and yet keep the implementation reasonably + simple. *) +module Make(Space : METRIC_SPACE) = struct + module Space = Space + + type key = Space.t + type num = Space.num + + type 'a t = + | Empty + | Node of 'a node * num * 'a t * 'a t + + and 'a node = { + key : key; + value : 'a; + tree : 'a t; + } + + (* insert a tree in the heap of trees to explore *) + let _insert_heap h key tree = match tree with + | Empty -> () + | Node (node, _, _, _) -> + let d = Space.distance key node.key in + Heap.insert h (tree, d) + + (* explore the given heap *) + let rec _explore_heap h key = + if Heap.is_empty h + then LazyList.Nil + else match Heap.pop h with + | Empty -> _explore_heap h key + | + + (* find closest points to [key] *) + let find_closest tree key = + match tree with + | Empty -> LazyList.Nil + | Node _ -> + let cmp (t1,d1) (t2,d2) = Space.compare d1 d2 in + let h = Heap.empty ~cmp in + _insert_heap h key tree; + _explore_heap h key + + let member tree key = + match tree with + | None -> false + | Some tree -> + match find_closest tree key with + | LazyList.Nil -> false + | LazyList.Cons ((_, _, 0), _) -> true + | _ -> false + + let size t = + let rec size t = match t with + | Leaf _ -> 1 + | Pair _ -> 2 + | Node (_, _, _, t1, t2) -> size t1 + size t2 + 1 + in match t with + | None -> 0 + | Some t -> size t + + (* utilities to build the tree *) + + let of_array a = assert false (* TODO *) + + let of_list l = of_array (Array.of_list l) + + let of_seq seq = + let l = ref [] in + seq (fun x -> l := x :: !l); + of_list !l + + let of_lazy_list l = of_seq (LazyList.iter l) (* TODO optimize *) +end diff --git a/VPTree.mli b/VPTree.mli new file mode 100644 index 00000000..21f441f4 --- /dev/null +++ b/VPTree.mli @@ -0,0 +1,112 @@ + +(* +copyright (c) 2013, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Vantage-Point Tree} + +A data structure used to store and retrieve elements that belong to a +metric space. References are: + + {{: https://github.com/UnixJunkie/vantage_point_tree_from_codepad/blob/master/vp_tree.ml} François Berenger's implementation} + {{: http://en.wikipedia.org/wiki/Vantage-point_tree} Wikipedia} +*) + +module type METRIC_SPACE = sig + type t + (** Elements of the metric space *) + + type num + (** Numeric type used for distances *) + + val distance : t -> t -> num + (** Distance between two points. It must satisfy the following invariants: + + - [distance x y = distance y x] + - [distance x y <= distance x z + distance z y] + - [distance x y >= 0] + - [distance x y = 0] if and only if [x] and [y] are the same point. + *) + + val add : num -> num -> num + (** Addition of two distances (associative, commutative) *) + + val compare : num -> num -> int + (** Total ordering on the numeric type used for distances; must + be compatible with {!add} *) +end + +(** {2 Utils} *) + +type 'a sequence = ('a -> unit) -> unit + +module LazyList : sig + type +'a t = + | Nil + | Cons of 'a * 'a t Lazy.t + + val take : int -> 'a t -> 'a t + + val to_list : 'a t -> 'a list + + val iter : 'a t -> ('a -> unit) -> unit +end + +(** {2 VPTree functorial interface} *) + +module type S = sig + module Space : METRIC_SPACE + + type key = Space.t + + type +'a t + (** VPTree that maps to data of type 'a *) + + val member : _ t -> key -> bool + (** Is the given key, member of the tree? *) + + val find_closest : 'a t -> key -> (key * 'a * int) LazyList.t + (** [find_closest tree key] finds the points of [tree] + that are the closest to [key], in increasing order. Enumerates all + the points eventually, if the whole list is explored. + Each tuple also contains the distance to the [key]. *) + + val of_array : (key * 'a) array -> 'a t + (** Build a VPTree from an array of bindings. The array is modified in + place (values are swapped). + If a key occurs several times in the sequence, one of the value + it maps to is chosen arbitrarily. *) + + val of_list : (key * 'a) list -> 'a t + (** Build a VPTree from a list *) + + val of_seq : (key * 'a) sequence -> 'a t + + val of_lazy_list : (key * 'a) LazyList.t -> 'a t + + val size : _ t -> int + (** Find the number of key/value pairs in the tree (linear time) *) +end + +module Make(Space : METRIC_SPACE) : S with module Space = Space diff --git a/_oasis b/_oasis index 945d8cb0..190ce0c0 100644 --- a/_oasis +++ b/_oasis @@ -40,7 +40,7 @@ Library "containers" Vector, Bij, PiCalculus, Bencode, Sexp, RAL, MultiSet, UnionFind, SmallSet, Leftistheap, AbsSet, CSM, MultiMap, ActionMan, BV, QCheck, BencodeOnDisk, Show, TTree, - HGraph + HGraph, VPTree BuildDepends: unix Library "containers_thread" diff --git a/_tags b/_tags index ab5e37c9..6916ca30 100644 --- a/_tags +++ b/_tags @@ -1,5 +1,5 @@ # OASIS_START -# DO NOT EDIT (digest: dcf1c0cacacb67335a863b18858eaeb7) +# DO NOT EDIT (digest: 96f10d0656c8edf5564bc01896f92844) # Ignore VCS directories, you can use the same kind of rule outside # OASIS_START/STOP if you want to exclude directories that contains # useless stuff for the build process @@ -52,6 +52,7 @@ "show.cmx": for-pack(Containers) "tTree.cmx": for-pack(Containers) "hGraph.cmx": for-pack(Containers) +"VPTree.cmx": for-pack(Containers) # Library containers_thread "containers_thread.cmxs": use_containers_thread <*.ml{,i}>: pkg_threads diff --git a/containers.mlpack b/containers.mlpack index 3132bdde..8aefe1f0 100644 --- a/containers.mlpack +++ b/containers.mlpack @@ -1,5 +1,5 @@ # OASIS_START -# DO NOT EDIT (digest: c05f285f4d4b4832b21ddd28863f4edf) +# DO NOT EDIT (digest: eb2e2715cb08b7b87800e38c24157fce) Cache Deque Gen @@ -37,4 +37,5 @@ BencodeOnDisk Show TTree HGraph +VPTree # OASIS_STOP diff --git a/containers.odocl b/containers.odocl index 48ce7541..4598ddf7 100644 --- a/containers.odocl +++ b/containers.odocl @@ -1,5 +1,5 @@ # OASIS_START -# DO NOT EDIT (digest: b68696a2df00165022fe21a389026f4f) +# DO NOT EDIT (digest: 8a4d45e8f3b44572f3dfbac03926e207) Cache Deque Gen @@ -37,5 +37,6 @@ BencodeOnDisk Show TTree HGraph +VPTree Behavior # OASIS_STOP diff --git a/setup.ml b/setup.ml index be2a5d7e..49da970b 100644 --- a/setup.ml +++ b/setup.ml @@ -1,7 +1,7 @@ (* setup.ml generated for the first time by OASIS v0.3.0 *) (* OASIS_START *) -(* DO NOT EDIT (digest: b3286e91d3acc2931801b7c5da3859c2) *) +(* DO NOT EDIT (digest: 79fce0538fbf243815415d95729ceb1e) *) (* Regenerated by OASIS v0.3.0 Visit http://oasis.forge.ocamlcore.org for more information and @@ -5904,7 +5904,8 @@ let setup_t = "BencodeOnDisk"; "Show"; "TTree"; - "HGraph" + "HGraph"; + "VPTree" ]; lib_pack = true; lib_internal_modules = []; @@ -6225,7 +6226,7 @@ let setup_t = }; oasis_fn = Some "_oasis"; oasis_version = "0.3.0"; - oasis_digest = Some "\139{\144\225j\1545\001\221\138R\129\177\182q*"; + oasis_digest = Some "\182\214L^I\190_b`\140ur\005\163\127\217"; oasis_exec = None; oasis_setup_args = []; setup_update = false; @@ -6233,6 +6234,6 @@ let setup_t = let setup () = BaseSetup.setup setup_t;; -# 6237 "setup.ml" +# 6238 "setup.ml" (* OASIS_STOP *) let () = setup ();;