diff --git a/levenshtein.ml b/levenshtein.ml index 438989db..7ce560a2 100644 --- a/levenshtein.ml +++ b/levenshtein.ml @@ -66,14 +66,16 @@ module type S = sig (** Levenshtein automaton *) val of_string : limit:int -> string_ -> automaton - (** Build an automaton from an array, with a maximal distance [limit] *) + (** Build an automaton from a string, with a maximal distance [limit]. + The automaton will accept strings whose {!edit_distance} to the + parameter is at most [limit]. *) val of_list : limit:int -> char_ list -> automaton (** Build an automaton from a list, with a maximal distance [limit] *) val debug_print : (out_channel -> char_ -> unit) -> out_channel -> automaton -> unit - (** Output the automaton on the given channel. *) + (** Output the automaton's structure on the given channel. *) val match_with : automaton -> string_ -> bool (** [match_with a s] matches the string [s] against [a], and returns @@ -85,7 +87,7 @@ module type S = sig module Index : sig type 'b t (** Index that maps strings to values of type 'b. Internally it is - based on a trie. *) + based on a trie. A string can only map to one value. *) val empty : 'b t (** Empty index *) @@ -93,20 +95,29 @@ module type S = sig val is_empty : _ t -> bool val add : 'b t -> string_ -> 'b -> 'b t - (** Add a char array to the index. If a value was already present - for this array it is replaced. *) + (** Add a pair string/value to the index. If a value was already present + for this string it is replaced. *) val remove : 'b t -> string_ -> 'b -> 'b t - (** Remove a string from the index. *) + (** Remove a string (and its associated value, if any) from the index. *) val retrieve : limit:int -> 'b t -> string_ -> 'b klist (** Lazy list of objects associated to strings close to the query string *) val of_list : (string_ * 'b) list -> 'b t + (** Build an index from a list of pairs of strings and values *) val to_list : 'b t -> (string_ * 'b) list + (** Extract a list of pairs from an index *) - (* TODO sequence/iteration functions *) + val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a + (** Fold over the stored pairs string/value *) + + val iter : (string_ -> 'b -> unit) -> 'b t -> unit + (** Iterate on the pairs *) + + val to_klist : 'b t -> (string_ * 'b) klist + (** Conversion to an iterator *) end end @@ -589,19 +600,45 @@ module Make(Str : STRING) = struct (fun acc (arr,v) -> add acc arr v) empty l - let to_list idx = + let fold f acc idx = let rec explore acc trail node = match node with | Node (opt, m) -> (* first, yield current value, if any *) let acc = match opt with | None -> acc - | Some v -> (Str.of_list (List.rev trail), v) :: acc + | Some v -> + let str = Str.of_list (List.rev trail) in + f acc str v in M.fold (fun c node' acc -> explore acc (c::trail) node') m acc in - explore [] [] idx + explore acc [] idx + + let iter f idx = + fold (fun () str v -> f str v) () idx + + let to_list idx = + fold (fun acc str v -> (str,v) :: acc) [] idx + + let to_klist idx = + let rec traverse node trail ~(fk:unit->(string_*'a) klist) = + match node with + | Node (opt, m) -> + (* all alternatives: continue exploring [m], or call [fk] *) + let fk = + M.fold + (fun c node' fk () -> traverse node' (c::trail) ~fk) + m fk + in + match opt with + | Some v -> + let str = Str.of_list (List.rev trail) in + `Cons ((str,v), fk) + | _ -> fk () (* fail... or explore subtrees *) + in + traverse idx [] ~fk:(fun () -> `Nil) end end diff --git a/levenshtein.mli b/levenshtein.mli index 2e7d9f0a..bb6afd3b 100644 --- a/levenshtein.mli +++ b/levenshtein.mli @@ -31,7 +31,11 @@ We take inspiration from http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata for the main algorithm and ideas. However some parts are adapted *) -(** {2 Abstraction over Strings} *) +(** {2 Abstraction over Strings} +Due to the existence of several encodings and string representations we +abstract over the type of strings. A string is a finite array of characters +(8-bits char, unicode runes, etc.) which provides a length operation +and a function to access the n-th character. *) module type STRING = sig type char_ @@ -43,7 +47,16 @@ module type STRING = sig val compare_char : char_ -> char_ -> int end -(** {2 Continuation list} *) +(** {2 Continuation list} + +This data structure is used to represent a list of result that is +evaluated only as far as the user wants. If the user only wants a few elements, +she doesn't pay for the remaining ones. + +In particular, when matching a string against a (big) set of indexed +strings, we return a continuation list so that, even if there are many results, +only those actually asked for are evaluated. *) + type 'a klist = [ | `Nil @@ -54,8 +67,28 @@ val klist_to_list : 'a klist -> 'a list (** Helper. *) (** {2 Signature} -We abstract over the type of characters and strings, so that we -can deal with several encodings, string representations, etc. *) + +The signature for a given string representation provides 3 main things: + +- a [edit_distance] function to compute the edit distance between strings +- an [automaton] type that is built from a string [s] and a maximum distance [n], + and only accepts the strings [s'] such that [edit_distance s s' <= n]. +- an [Index] module that can be used to map many strings to values, like + a regular string map, but for which retrieval is fuzzy (for a given + maximal distance). + +A possible use of the index could be: +[ +open Batteries;; + +let words = File.with_file_in "/usr/share/dict/english" + (fun i -> IO.read_all i |> String.nsplit ~by:"\\n");; + +let words = List.map (fun s->s,s) words;; +let idx = Levenshtein.Index.of_list words;; + +Levenshtein.Index.retrieve ~limit:1 idx "hell" |> Levenshtein.klist_to_list;; +*) module type S = sig type char_ @@ -76,14 +109,16 @@ module type S = sig (** Levenshtein automaton *) val of_string : limit:int -> string_ -> automaton - (** Build an automaton from an array, with a maximal distance [limit] *) + (** Build an automaton from a string, with a maximal distance [limit]. + The automaton will accept strings whose {!edit_distance} to the + parameter is at most [limit]. *) val of_list : limit:int -> char_ list -> automaton (** Build an automaton from a list, with a maximal distance [limit] *) val debug_print : (out_channel -> char_ -> unit) -> out_channel -> automaton -> unit - (** Output the automaton on the given channel. *) + (** Output the automaton's structure on the given channel. *) val match_with : automaton -> string_ -> bool (** [match_with a s] matches the string [s] against [a], and returns @@ -95,7 +130,7 @@ module type S = sig module Index : sig type 'b t (** Index that maps strings to values of type 'b. Internally it is - based on a trie. *) + based on a trie. A string can only map to one value. *) val empty : 'b t (** Empty index *) @@ -103,20 +138,29 @@ module type S = sig val is_empty : _ t -> bool val add : 'b t -> string_ -> 'b -> 'b t - (** Add a char array to the index. If a value was already present - for this array it is replaced. *) + (** Add a pair string/value to the index. If a value was already present + for this string it is replaced. *) val remove : 'b t -> string_ -> 'b -> 'b t - (** Remove a string from the index. *) + (** Remove a string (and its associated value, if any) from the index. *) val retrieve : limit:int -> 'b t -> string_ -> 'b klist (** Lazy list of objects associated to strings close to the query string *) val of_list : (string_ * 'b) list -> 'b t + (** Build an index from a list of pairs of strings and values *) val to_list : 'b t -> (string_ * 'b) list + (** Extract a list of pairs from an index *) - (* TODO sequence/iteration functions *) + val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a + (** Fold over the stored pairs string/value *) + + val iter : (string_ -> 'b -> unit) -> 'b t -> unit + (** Iterate on the pairs *) + + val to_klist : 'b t -> (string_ * 'b) klist + (** Conversion to an iterator *) end end