mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
iteration over Levenshtein indexes
This commit is contained in:
parent
49dd2f9054
commit
4a68729485
2 changed files with 102 additions and 21 deletions
|
|
@ -66,14 +66,16 @@ module type S = sig
|
|||
(** Levenshtein automaton *)
|
||||
|
||||
val of_string : limit:int -> string_ -> automaton
|
||||
(** Build an automaton from an array, with a maximal distance [limit] *)
|
||||
(** Build an automaton from a string, with a maximal distance [limit].
|
||||
The automaton will accept strings whose {!edit_distance} to the
|
||||
parameter is at most [limit]. *)
|
||||
|
||||
val of_list : limit:int -> char_ list -> automaton
|
||||
(** Build an automaton from a list, with a maximal distance [limit] *)
|
||||
|
||||
val debug_print : (out_channel -> char_ -> unit) ->
|
||||
out_channel -> automaton -> unit
|
||||
(** Output the automaton on the given channel. *)
|
||||
(** Output the automaton's structure on the given channel. *)
|
||||
|
||||
val match_with : automaton -> string_ -> bool
|
||||
(** [match_with a s] matches the string [s] against [a], and returns
|
||||
|
|
@ -85,7 +87,7 @@ module type S = sig
|
|||
module Index : sig
|
||||
type 'b t
|
||||
(** Index that maps strings to values of type 'b. Internally it is
|
||||
based on a trie. *)
|
||||
based on a trie. A string can only map to one value. *)
|
||||
|
||||
val empty : 'b t
|
||||
(** Empty index *)
|
||||
|
|
@ -93,20 +95,29 @@ module type S = sig
|
|||
val is_empty : _ t -> bool
|
||||
|
||||
val add : 'b t -> string_ -> 'b -> 'b t
|
||||
(** Add a char array to the index. If a value was already present
|
||||
for this array it is replaced. *)
|
||||
(** Add a pair string/value to the index. If a value was already present
|
||||
for this string it is replaced. *)
|
||||
|
||||
val remove : 'b t -> string_ -> 'b -> 'b t
|
||||
(** Remove a string from the index. *)
|
||||
(** Remove a string (and its associated value, if any) from the index. *)
|
||||
|
||||
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
|
||||
(** Lazy list of objects associated to strings close to the query string *)
|
||||
|
||||
val of_list : (string_ * 'b) list -> 'b t
|
||||
(** Build an index from a list of pairs of strings and values *)
|
||||
|
||||
val to_list : 'b t -> (string_ * 'b) list
|
||||
(** Extract a list of pairs from an index *)
|
||||
|
||||
(* TODO sequence/iteration functions *)
|
||||
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
|
||||
(** Fold over the stored pairs string/value *)
|
||||
|
||||
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
|
||||
(** Iterate on the pairs *)
|
||||
|
||||
val to_klist : 'b t -> (string_ * 'b) klist
|
||||
(** Conversion to an iterator *)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
@ -589,19 +600,45 @@ module Make(Str : STRING) = struct
|
|||
(fun acc (arr,v) -> add acc arr v)
|
||||
empty l
|
||||
|
||||
let to_list idx =
|
||||
let fold f acc idx =
|
||||
let rec explore acc trail node = match node with
|
||||
| Node (opt, m) ->
|
||||
(* first, yield current value, if any *)
|
||||
let acc = match opt with
|
||||
| None -> acc
|
||||
| Some v -> (Str.of_list (List.rev trail), v) :: acc
|
||||
| Some v ->
|
||||
let str = Str.of_list (List.rev trail) in
|
||||
f acc str v
|
||||
in
|
||||
M.fold
|
||||
(fun c node' acc -> explore acc (c::trail) node')
|
||||
m acc
|
||||
in
|
||||
explore [] [] idx
|
||||
explore acc [] idx
|
||||
|
||||
let iter f idx =
|
||||
fold (fun () str v -> f str v) () idx
|
||||
|
||||
let to_list idx =
|
||||
fold (fun acc str v -> (str,v) :: acc) [] idx
|
||||
|
||||
let to_klist idx =
|
||||
let rec traverse node trail ~(fk:unit->(string_*'a) klist) =
|
||||
match node with
|
||||
| Node (opt, m) ->
|
||||
(* all alternatives: continue exploring [m], or call [fk] *)
|
||||
let fk =
|
||||
M.fold
|
||||
(fun c node' fk () -> traverse node' (c::trail) ~fk)
|
||||
m fk
|
||||
in
|
||||
match opt with
|
||||
| Some v ->
|
||||
let str = Str.of_list (List.rev trail) in
|
||||
`Cons ((str,v), fk)
|
||||
| _ -> fk () (* fail... or explore subtrees *)
|
||||
in
|
||||
traverse idx [] ~fk:(fun () -> `Nil)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
|||
|
|
@ -31,7 +31,11 @@ We take inspiration from
|
|||
http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
|
||||
for the main algorithm and ideas. However some parts are adapted *)
|
||||
|
||||
(** {2 Abstraction over Strings} *)
|
||||
(** {2 Abstraction over Strings}
|
||||
Due to the existence of several encodings and string representations we
|
||||
abstract over the type of strings. A string is a finite array of characters
|
||||
(8-bits char, unicode runes, etc.) which provides a length operation
|
||||
and a function to access the n-th character. *)
|
||||
|
||||
module type STRING = sig
|
||||
type char_
|
||||
|
|
@ -43,7 +47,16 @@ module type STRING = sig
|
|||
val compare_char : char_ -> char_ -> int
|
||||
end
|
||||
|
||||
(** {2 Continuation list} *)
|
||||
(** {2 Continuation list}
|
||||
|
||||
This data structure is used to represent a list of result that is
|
||||
evaluated only as far as the user wants. If the user only wants a few elements,
|
||||
she doesn't pay for the remaining ones.
|
||||
|
||||
In particular, when matching a string against a (big) set of indexed
|
||||
strings, we return a continuation list so that, even if there are many results,
|
||||
only those actually asked for are evaluated. *)
|
||||
|
||||
type 'a klist =
|
||||
[
|
||||
| `Nil
|
||||
|
|
@ -54,8 +67,28 @@ val klist_to_list : 'a klist -> 'a list
|
|||
(** Helper. *)
|
||||
|
||||
(** {2 Signature}
|
||||
We abstract over the type of characters and strings, so that we
|
||||
can deal with several encodings, string representations, etc. *)
|
||||
|
||||
The signature for a given string representation provides 3 main things:
|
||||
|
||||
- a [edit_distance] function to compute the edit distance between strings
|
||||
- an [automaton] type that is built from a string [s] and a maximum distance [n],
|
||||
and only accepts the strings [s'] such that [edit_distance s s' <= n].
|
||||
- an [Index] module that can be used to map many strings to values, like
|
||||
a regular string map, but for which retrieval is fuzzy (for a given
|
||||
maximal distance).
|
||||
|
||||
A possible use of the index could be:
|
||||
[
|
||||
open Batteries;;
|
||||
|
||||
let words = File.with_file_in "/usr/share/dict/english"
|
||||
(fun i -> IO.read_all i |> String.nsplit ~by:"\\n");;
|
||||
|
||||
let words = List.map (fun s->s,s) words;;
|
||||
let idx = Levenshtein.Index.of_list words;;
|
||||
|
||||
Levenshtein.Index.retrieve ~limit:1 idx "hell" |> Levenshtein.klist_to_list;;
|
||||
*)
|
||||
|
||||
module type S = sig
|
||||
type char_
|
||||
|
|
@ -76,14 +109,16 @@ module type S = sig
|
|||
(** Levenshtein automaton *)
|
||||
|
||||
val of_string : limit:int -> string_ -> automaton
|
||||
(** Build an automaton from an array, with a maximal distance [limit] *)
|
||||
(** Build an automaton from a string, with a maximal distance [limit].
|
||||
The automaton will accept strings whose {!edit_distance} to the
|
||||
parameter is at most [limit]. *)
|
||||
|
||||
val of_list : limit:int -> char_ list -> automaton
|
||||
(** Build an automaton from a list, with a maximal distance [limit] *)
|
||||
|
||||
val debug_print : (out_channel -> char_ -> unit) ->
|
||||
out_channel -> automaton -> unit
|
||||
(** Output the automaton on the given channel. *)
|
||||
(** Output the automaton's structure on the given channel. *)
|
||||
|
||||
val match_with : automaton -> string_ -> bool
|
||||
(** [match_with a s] matches the string [s] against [a], and returns
|
||||
|
|
@ -95,7 +130,7 @@ module type S = sig
|
|||
module Index : sig
|
||||
type 'b t
|
||||
(** Index that maps strings to values of type 'b. Internally it is
|
||||
based on a trie. *)
|
||||
based on a trie. A string can only map to one value. *)
|
||||
|
||||
val empty : 'b t
|
||||
(** Empty index *)
|
||||
|
|
@ -103,20 +138,29 @@ module type S = sig
|
|||
val is_empty : _ t -> bool
|
||||
|
||||
val add : 'b t -> string_ -> 'b -> 'b t
|
||||
(** Add a char array to the index. If a value was already present
|
||||
for this array it is replaced. *)
|
||||
(** Add a pair string/value to the index. If a value was already present
|
||||
for this string it is replaced. *)
|
||||
|
||||
val remove : 'b t -> string_ -> 'b -> 'b t
|
||||
(** Remove a string from the index. *)
|
||||
(** Remove a string (and its associated value, if any) from the index. *)
|
||||
|
||||
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
|
||||
(** Lazy list of objects associated to strings close to the query string *)
|
||||
|
||||
val of_list : (string_ * 'b) list -> 'b t
|
||||
(** Build an index from a list of pairs of strings and values *)
|
||||
|
||||
val to_list : 'b t -> (string_ * 'b) list
|
||||
(** Extract a list of pairs from an index *)
|
||||
|
||||
(* TODO sequence/iteration functions *)
|
||||
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
|
||||
(** Fold over the stored pairs string/value *)
|
||||
|
||||
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
|
||||
(** Iterate on the pairs *)
|
||||
|
||||
val to_klist : 'b t -> (string_ * 'b) klist
|
||||
(** Conversion to an iterator *)
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue