ocaml-containers/string/levenshtein.mli

178 lines
6 KiB
OCaml

(*
copyright (c) 2013, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Levenshtein distance}
We take inspiration from
http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
for the main algorithm and ideas. However, some parts have been adapted. *)
(** {2 Abstraction over Strings}
Due to the existence of several encodings and string representations we
abstract over the type of strings. A string is a finite array of characters
(8-bits char, unicode runes, etc.) which provides a length operation
and a function to access the n-th character. *)
module type STRING = sig
type char_
(** Type of characters (e.g. 8-bit chars, unicode runes, etc.). *)
type t
(** Type of strings: finite arrays of characters of type {!char_}. *)
val of_list : char_ list -> t
(** Build a string from a list of characters. *)
val get : t -> int -> char_
(** [get s n] accesses the [n]-th character of [s]. The behaviour for
an out-of-range [n] is not specified by this signature. *)
val length : t -> int
(** Number of characters in the string. *)
val compare_char : char_ -> char_ -> int
(** Comparison function on characters. Presumably follows the usual
[compare] convention (negative, zero or positive result); to be confirmed
against the implementations of this signature. *)
end
(** {2 Continuation list}
This data structure is used to represent a list of results that is
evaluated only as far as the user wants. If the user only wants a few elements,
she doesn't pay for the remaining ones.
In particular, when matching a string against a (big) set of indexed
strings, we return a continuation list so that, even if there are many results,
only those actually asked for are evaluated. *)
(* [`Nil] is the empty list; [`Cons (x, tail)] carries one element [x]
together with a function [tail] that returns the rest of the list when
applied to [()]. *)
type 'a klist = [ `Nil | `Cons of 'a * (unit -> 'a klist) ]
val klist_to_list : 'a klist -> 'a list
(** Helper that converts a continuation list into a regular list.
NOTE(review): this presumably evaluates every element of the continuation
list (the implementation is not visible from this interface), so it may be
costly on large result sets. *)
(** {2 Signature}
The signature for a given string representation provides 3 main things:
- an [edit_distance] function to compute the edit distance between strings
- an [automaton] type that is built from a string [s] and a maximum distance [n],
and only accepts the strings [s'] such that [edit_distance s s' <= n].
- an [Index] module that can be used to map many strings to values, like
a regular string map, but for which retrieval is fuzzy (for a given
maximal distance).
A possible use of the index could be:
{[
open Batteries;;
let words = File.with_file_in "/usr/share/dict/english"
(fun i -> IO.read_all i |> String.nsplit ~by:"\\n");;
let words = List.map (fun s->s,s) words;;
let idx = Levenshtein.Index.of_list words;;
Levenshtein.Index.retrieve ~limit:1 idx "hell" |> Levenshtein.klist_to_list;;
]}
*)
module type S = sig
type char_
(** Type of the characters that strings are made of. *)
type string_
(** Type of the strings handled by this instance. *)
(** {6 Edit Distance} *)
val edit_distance : string_ -> string_ -> int
(** Edit distance between two strings. This satisfies the classical
distance axioms: it is always non-negative, symmetric, and satisfies
the triangle inequality
[edit_distance a b + edit_distance b c >= edit_distance a c] *)
(** {6 Automaton}
An automaton, built from a string [s] and a limit [n], that accepts
every string that is at distance at most [n] from [s]. *)
type automaton
(** Levenshtein automaton *)
val of_string : limit:int -> string_ -> automaton
(** Build an automaton from a string, with a maximal distance [limit].
The automaton will accept strings whose {!edit_distance} to the
parameter is at most [limit]. *)
val of_list : limit:int -> char_ list -> automaton
(** Build an automaton from a list of characters, with a maximal
distance [limit] *)
val debug_print : (out_channel -> char_ -> unit) ->
out_channel -> automaton -> unit
(** Output the automaton's structure on the given channel. The first
argument is a function that writes a single character on a channel. *)
val match_with : automaton -> string_ -> bool
(** [match_with a s] matches the string [s] against [a], and returns
[true] if the distance from [s] to the word represented by [a] is at most
the limit used to build [a] *)
(** {6 Index for one-to-many matching} *)
module Index : sig
type 'b t
(** Index that maps strings to values of type 'b. Internally it is
based on a trie. A string can only map to one value. *)
val empty : 'b t
(** Empty index *)
val is_empty : _ t -> bool
(** [is_empty idx] tells whether the index [idx] is empty. *)
val add : 'b t -> string_ -> 'b -> 'b t
(** Add a pair string/value to the index. If a value was already present
for this string it is replaced. *)
val remove : 'b t -> string_ -> 'b -> 'b t
(** Remove a string (and its associated value, if any) from the index.
NOTE(review): the role of the ['b] argument is not described here; since
a string maps to at most one value, the string alone would seem to identify
the binding. Confirm against the implementation. *)
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
(** [retrieve ~limit idx s] is the lazy list of objects associated to
strings close to the query string [s], [limit] being the maximal distance
allowed. *)
val of_list : (string_ * 'b) list -> 'b t
(** Build an index from a list of pairs of strings and values *)
val to_list : 'b t -> (string_ * 'b) list
(** Extract a list of pairs from an index *)
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
(** Fold over the stored pairs string/value *)
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
(** Iterate on the pairs *)
val to_klist : 'b t -> (string_ * 'b) klist
(** Conversion to a continuation list of the pairs string/value *)
end
end
(** {2 Functor} *)
module Make(Str : STRING) : S
with type string_ = Str.t
and type char_ = Str.char_
(** [Make(Str)] provides the signature {!S} for the string representation
[Str]: its strings are of type [Str.t] and its characters of type
[Str.char_]. *)
(** {2 Default instance: string} *)
include S with type char_ = char and type string_ = string
val debug_print : out_channel -> automaton -> unit
(** Output the automaton's structure on the given channel. This
redeclares the [debug_print] of {!S} for the default instance: no
character printer is required as first argument since characters
are plain [char]. *)