mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
Levenshtein automaton for fuzzy string searching
This commit is contained in:
parent
a2e0d85dca
commit
387ec86560
7 changed files with 401 additions and 8 deletions
2
_oasis
2
_oasis
|
|
@ -40,7 +40,7 @@ Library "containers"
|
|||
Vector, Bij, PiCalculus, Bencode, Sexp, RAL, MultiSet,
|
||||
UnionFind, SmallSet, Leftistheap, AbsSet, CSM, MultiMap,
|
||||
ActionMan, BV, QCheck, BencodeOnDisk, Show, TTree,
|
||||
HGraph, Automaton, Conv
|
||||
HGraph, Automaton, Conv, Levenshtein
|
||||
BuildDepends: unix
|
||||
|
||||
Library "containers_thread"
|
||||
|
|
|
|||
3
_tags
3
_tags
|
|
@ -1,5 +1,5 @@
|
|||
# OASIS_START
|
||||
# DO NOT EDIT (digest: dcba3d944e29e7f7be0fecd41665b91e)
|
||||
# DO NOT EDIT (digest: 47b2b524d1884245470e1a1a6acb3e85)
|
||||
# Ignore VCS directories, you can use the same kind of rule outside
|
||||
# OASIS_START/STOP if you want to exclude directories that contains
|
||||
# useless stuff for the build process
|
||||
|
|
@ -54,6 +54,7 @@
|
|||
"hGraph.cmx": for-pack(Containers)
|
||||
"automaton.cmx": for-pack(Containers)
|
||||
"conv.cmx": for-pack(Containers)
|
||||
"levenshtein.cmx": for-pack(Containers)
|
||||
# Library containers_thread
|
||||
"threads/containers_thread.cmxs": use_containers_thread
|
||||
<threads/*.ml{,i}>: use_containers
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# OASIS_START
|
||||
# DO NOT EDIT (digest: ff5363e0f2b41f49070889cafc95f37b)
|
||||
# DO NOT EDIT (digest: 38e5fd431c40c173ee89b060fc69bed7)
|
||||
Cache
|
||||
Deque
|
||||
Gen
|
||||
|
|
@ -39,4 +39,5 @@ TTree
|
|||
HGraph
|
||||
Automaton
|
||||
Conv
|
||||
Levenshtein
|
||||
# OASIS_STOP
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
# OASIS_START
|
||||
# DO NOT EDIT (digest: ff5363e0f2b41f49070889cafc95f37b)
|
||||
# DO NOT EDIT (digest: 38e5fd431c40c173ee89b060fc69bed7)
|
||||
Cache
|
||||
Deque
|
||||
Gen
|
||||
|
|
@ -39,4 +39,5 @@ TTree
|
|||
HGraph
|
||||
Automaton
|
||||
Conv
|
||||
Levenshtein
|
||||
# OASIS_STOP
|
||||
|
|
|
|||
335
levenshtein.ml
Normal file
335
levenshtein.ml
Normal file
|
|
@ -0,0 +1,335 @@
|
|||
|
||||
(*
|
||||
copyright (c) 2013, simon cruanes
|
||||
all rights reserved.
|
||||
|
||||
redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer. redistributions in binary
|
||||
form must reproduce the above copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other materials provided with
|
||||
the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*)
|
||||
|
||||
|
||||
(** {1 Levenshtein distance} *)
|
||||
|
||||
module NDA = struct
|
||||
type 'a char =
|
||||
| Any
|
||||
| Char of 'a
|
||||
type 'a transition =
|
||||
| Success
|
||||
| Upon of 'a char * int * int
|
||||
| Epsilon of int * int
|
||||
|
||||
(* non deterministic automaton *)
|
||||
type 'a t = {
|
||||
compare : 'a -> 'a -> int;
|
||||
matrix : 'a transition list array array;
|
||||
}
|
||||
|
||||
let length nda = Array.length nda.matrix
|
||||
|
||||
let get_compare nda = nda.compare
|
||||
|
||||
(* build NDA from the "get : int -> 'a" function *)
|
||||
let make ~compare ~limit ~len ~get =
|
||||
let m = Array.make_matrix len limit [] in
|
||||
let add_transition i j tr =
|
||||
m.(i).(j) <- tr :: m.(i).(j)
|
||||
in
|
||||
(* internal transitions *)
|
||||
for i = 0 to len-1 do
|
||||
for j = 0 to limit do
|
||||
(* correct char *)
|
||||
add_transition i j (Upon (Char (get i), i+1, j));
|
||||
(* other transitions *)
|
||||
if j < limit then begin
|
||||
(* substitution *)
|
||||
add_transition i j (Upon (Any, i+1, j+1));
|
||||
(* deletion in indexed string *)
|
||||
add_transition i j (Upon (Any, i, j+1));
|
||||
(* addition to indexed string *)
|
||||
add_transition i j (Epsilon (i+1, j+1));
|
||||
end
|
||||
done
|
||||
done;
|
||||
for j = 0 to limit do
|
||||
(* deletions at the end *)
|
||||
if j < limit
|
||||
then add_transition (len-1) j (Upon (Any, len-1, j+1));
|
||||
(* win in any case *)
|
||||
add_transition (len-1) j Success;
|
||||
done;
|
||||
{ matrix=m; compare; }
|
||||
|
||||
let get nda (i,j) =
|
||||
nda.matrix.(i).(j)
|
||||
end
|
||||
|
||||
(** deterministic automaton *)
|
||||
module DFA = struct
|
||||
type 'a transition =
|
||||
| Success
|
||||
| Upon of 'a * int (* transition to state i *)
|
||||
| Otherwise of int (* transition to state i *)
|
||||
|
||||
type 'a t = {
|
||||
compare : 'a -> 'a -> int;
|
||||
mutable transitions : 'a transition list array;
|
||||
mutable len : int;
|
||||
}
|
||||
|
||||
let create ~compare size = {
|
||||
compare;
|
||||
len = 0;
|
||||
transitions = Array.make size [];
|
||||
}
|
||||
|
||||
(* add a new state *)
|
||||
let add_state dfa =
|
||||
let n = dfa.len in
|
||||
(* resize *)
|
||||
if n = Array.length dfa.transitions then begin
|
||||
let a' = Array.make (2*n) [] in
|
||||
Array.blit dfa.transitions 0 a' 0 n;
|
||||
dfa.transitions <- a'
|
||||
end;
|
||||
dfa.len <- n + 1;
|
||||
n
|
||||
|
||||
(* add transition *)
|
||||
let add_transition dfa i tr =
|
||||
dfa.transitions.(i) <- tr :: dfa.transitions.(i)
|
||||
|
||||
(* set of pairs of ints: used for representing a set of states of the NDA *)
|
||||
module NDAStateSet = Set.Make(struct
|
||||
type t = int * int
|
||||
let compare = Pervasives.compare
|
||||
end)
|
||||
|
||||
(* list of characters that can specifically be followed from the given set *)
|
||||
let chars_from_set nda set =
|
||||
NDAStateSet.fold
|
||||
(fun state acc ->
|
||||
let transitions = NDA.get nda state in
|
||||
List.fold_left
|
||||
(fun acc tr -> match tr with
|
||||
| NDA.Upon (NDA.Char c, _, _) ->
|
||||
if List.exists (fun c' -> nda.NDA.compare c c' = 0) acc
|
||||
then acc
|
||||
else c :: acc (* new char! *)
|
||||
| _ -> acc
|
||||
) acc transitions
|
||||
) set []
|
||||
|
||||
(* saturate current set w.r.t epsilon links *)
|
||||
let saturate_epsilon nda set =
|
||||
let q = Queue.create () in
|
||||
NDAStateSet.iter (fun s -> Queue.push s q) set;
|
||||
let set = ref set in
|
||||
while not (Queue.is_empty q) do
|
||||
let state = Queue.pop q in
|
||||
List.iter
|
||||
(fun tr' -> match tr' with
|
||||
| NDA.Epsilon (i,j) ->
|
||||
if not (NDAStateSet.mem (i,j) !set)
|
||||
then (set := NDAStateSet.add (i,j) !set; Queue.push (i,j) q)
|
||||
| _ -> ()
|
||||
) (NDA.get nda state)
|
||||
done;
|
||||
!set
|
||||
|
||||
exception LeadToSuccess
|
||||
|
||||
(* find the transition that matches the given char (if any);
|
||||
may raise exceptions Not_found or LeadToSuccess. *)
|
||||
let rec get_transition_for_char nda c transitions =
|
||||
match c, transitions with
|
||||
| _, NDA.Success::_ -> raise LeadToSuccess
|
||||
| NDA.Char c', NDA.Upon (NDA.Char c'', i, j) :: transitions' ->
|
||||
if nda.NDA.compare c' c'' = 0
|
||||
then i, j
|
||||
else get_transition_for_char nda c transitions'
|
||||
| NDA.Any, NDA.Upon (NDA.Any, i, j) :: _ -> i, j
|
||||
| _, NDA.Upon (NDA.Any, i, j) :: transitions' ->
|
||||
begin try get_transition_for_char nda c transitions'
|
||||
with Not_found -> i, j (* only if no other transition works *)
|
||||
end
|
||||
| _, _::transitions' -> get_transition_for_char nda c transitions'
|
||||
| _, [] -> raise Not_found
|
||||
|
||||
(* follow transition for given NDA.char, returns a new state
|
||||
and a boolean indicating whether it's final *)
|
||||
let follow_transition nda set c =
|
||||
let is_final = ref false in
|
||||
let set' = NDAStateSet.fold
|
||||
(fun state acc ->
|
||||
(* possible transitions *)
|
||||
let transitions = NDA.get nda state in
|
||||
try
|
||||
let state' = get_transition_for_char nda c transitions in
|
||||
NDAStateSet.add state' acc
|
||||
with
|
||||
| LeadToSuccess -> is_final := true; acc
|
||||
| Not_found -> acc (* state dies *)
|
||||
) set NDAStateSet.empty
|
||||
in
|
||||
let set' = saturate_epsilon nda set' in
|
||||
set', !is_final
|
||||
|
||||
(* only follow "Any" transitions *)
|
||||
let follow_other_transition nda set =
|
||||
let is_final = ref false in
|
||||
let set' = NDAStateSet.fold
|
||||
(fun state acc ->
|
||||
(* possible transitions *)
|
||||
let transitions = NDA.get nda state in
|
||||
try
|
||||
let state' = get_transition_for_char nda NDA.Any transitions in
|
||||
NDAStateSet.add state' acc
|
||||
with
|
||||
| LeadToSuccess -> is_final := true; acc
|
||||
| Not_found -> acc (* state dies *)
|
||||
) set NDAStateSet.empty
|
||||
in
|
||||
let set' = saturate_epsilon nda set' in
|
||||
set', !is_final
|
||||
|
||||
(* call [k] with every [transition'] that can be reached from [set], with
|
||||
a bool that states whether it's final *)
|
||||
let iterate_transition_set nda set k =
|
||||
(* all possible "fixed char" transitions *)
|
||||
let chars = chars_from_set nda set in
|
||||
List.iter
|
||||
(fun c ->
|
||||
let set', is_final = follow_transition nda set (NDA.Char c) in
|
||||
k ~is_final (NDA.Char c) set')
|
||||
chars;
|
||||
(* remaining transitions, with only "Any" *)
|
||||
let set', is_final = follow_other_transition nda set in
|
||||
k ~is_final NDA.Any set'
|
||||
|
||||
module StateSetMap = Map.Make(NDAStateSet)
|
||||
|
||||
(* get the state that corresponds to the given set of NDA states.
|
||||
[states] is a map [nda set] -> [nfa state] *)
|
||||
let get_state dfa states set =
|
||||
try StateSetMap.find set !states
|
||||
with Not_found ->
|
||||
let i = add_state dfa in
|
||||
states := StateSetMap.add set i !states;
|
||||
i
|
||||
|
||||
(* traverse the NDA. Currently we're at [set] *)
|
||||
let rec traverse nda dfa states set =
|
||||
let set = saturate_epsilon nda set in
|
||||
let set_i = get_state dfa states set in
|
||||
iterate_transition_set nda set
|
||||
(fun ~is_final c set' ->
|
||||
let set'_i = get_state dfa states set' in
|
||||
(* did we reach success? *)
|
||||
if is_final
|
||||
then add_transition dfa set'_i Success;
|
||||
(* link set -> set' *)
|
||||
match c with
|
||||
| NDA.Any ->
|
||||
add_transition dfa set_i (Otherwise set'_i)
|
||||
| NDA.Char c' ->
|
||||
add_transition dfa set_i (Upon (c', set'_i))
|
||||
)
|
||||
|
||||
let of_nda nda =
|
||||
let compare = NDA.get_compare nda in
|
||||
let dfa = create ~compare (NDA.length nda) in
|
||||
(* map (set of NDA states) to int (state in DFA) *)
|
||||
let states = ref StateSetMap.empty in
|
||||
(* traverse the NDA to build the NFA *)
|
||||
traverse nda dfa states (NDAStateSet.singleton (0,0));
|
||||
dfa
|
||||
|
||||
let get dfa i =
|
||||
dfa.transitions.(i)
|
||||
end
|
||||
|
||||
type 'a automaton = 'a DFA.t
|
||||
|
||||
let of_array ?(compare=Pervasives.compare) ~limit a =
|
||||
let nda = NDA.make ~compare ~limit ~len:(Array.length a) ~get:(Array.get a) in
|
||||
let dfa = DFA.of_nda nda in
|
||||
dfa
|
||||
|
||||
let of_list ?compare ~limit l =
|
||||
of_array ?compare ~limit (Array.of_list l)
|
||||
|
||||
let of_string ~limit a =
|
||||
let compare = Char.compare in
|
||||
let nda = NDA.make ~compare ~limit ~len:(String.length a) ~get:(String.get a) in
|
||||
let dfa = DFA.of_nda nda in
|
||||
dfa
|
||||
|
||||
type match_result =
|
||||
| TooFar
|
||||
| Distance of int
|
||||
|
||||
exception FoundDistance of int
|
||||
|
||||
let rec __has_success = function
|
||||
| [] -> false
|
||||
| DFA.Success :: _ -> true
|
||||
| _ :: l' -> __has_success l'
|
||||
|
||||
let rec __find_char ~compare c l k = match l with
|
||||
| [] -> ()
|
||||
| DFA.Upon (c', next) :: l' ->
|
||||
if compare c c' = 0
|
||||
then k next
|
||||
else __find_char ~compare c l' k
|
||||
| _ :: l' -> __find_char ~compare c l' k
|
||||
|
||||
let rec __find_otherwise l k = match l with
|
||||
| [] -> ()
|
||||
| DFA.Otherwise next :: _ -> k next
|
||||
| _::l' -> __find_otherwise l' k
|
||||
|
||||
(* real matching function *)
|
||||
let __match ~len ~get dfa =
|
||||
let rec search ~dist i state =
|
||||
if i = len then raise (FoundDistance dist)
|
||||
else begin
|
||||
let transitions = DFA.get dfa state in
|
||||
if __has_success transitions
|
||||
then raise (FoundDistance dist);
|
||||
(* current char *)
|
||||
let c = get i in
|
||||
__find_char ~compare:dfa.DFA.compare c transitions
|
||||
(fun next -> search ~dist (i+1) next);
|
||||
__find_otherwise transitions
|
||||
(fun next -> search ~dist:(dist+1) (i+1) next);
|
||||
end
|
||||
in
|
||||
try
|
||||
search ~dist:0 0 0;
|
||||
TooFar
|
||||
with FoundDistance i ->
|
||||
Distance i
|
||||
|
||||
let match_with dfa a =
|
||||
__match ~len:(Array.length a) ~get:(Array.get a) dfa
|
||||
|
||||
let match_with_string dfa s =
|
||||
__match ~len:(String.length s) ~get:(String.get s) dfa
|
||||
53
levenshtein.mli
Normal file
53
levenshtein.mli
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
(*
|
||||
copyright (c) 2013, simon cruanes
|
||||
all rights reserved.
|
||||
|
||||
redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer. redistributions in binary
|
||||
form must reproduce the above copyright notice, this list of conditions and the
|
||||
following disclaimer in the documentation and/or other materials provided with
|
||||
the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*)
|
||||
|
||||
|
||||
(** {1 Levenshtein distance} *)
|
||||
|
||||
(** {2 Automaton} *)
|
||||
|
||||
type 'a automaton
|
||||
(** Levenshtein automaton for characters of type 'a *)
|
||||
|
||||
val of_array : ?compare:('a -> 'a -> int) -> limit:int -> 'a array -> 'a automaton
|
||||
(** Build an automaton from an array, with a maximal distance [limit] *)
|
||||
|
||||
val of_list : ?compare:('a -> 'a -> int) -> limit:int -> 'a list -> 'a automaton
|
||||
(** Build an automaton from a list, with a maximal distance [limit] *)
|
||||
|
||||
val of_string : limit:int -> string -> char automaton
|
||||
(** Automaton for the special case of strings *)
|
||||
|
||||
type match_result =
|
||||
| TooFar
|
||||
| Distance of int
|
||||
|
||||
val match_with : 'a automaton -> 'a array -> match_result
|
||||
(** [match_with a s] matches the string [s] against [a], and returns
|
||||
the distance from [s] to the word represented by [a] *)
|
||||
|
||||
val match_with_string : char automaton -> string -> match_result
|
||||
(** Specialized version of {!match_with} for strings *)
|
||||
10
setup.ml
10
setup.ml
|
|
@ -1,7 +1,7 @@
|
|||
(* setup.ml generated for the first time by OASIS v0.3.0 *)
|
||||
|
||||
(* OASIS_START *)
|
||||
(* DO NOT EDIT (digest: 70d10cce7646b7db6e9097bd53a36898) *)
|
||||
(* DO NOT EDIT (digest: 6434f259ac3da73977f0b444b8699a51) *)
|
||||
(*
|
||||
Regenerated by OASIS v0.4.0
|
||||
Visit http://oasis.forge.ocamlcore.org for more information and
|
||||
|
|
@ -6886,7 +6886,8 @@ let setup_t =
|
|||
"TTree";
|
||||
"HGraph";
|
||||
"Automaton";
|
||||
"Conv"
|
||||
"Conv";
|
||||
"Levenshtein"
|
||||
];
|
||||
lib_pack = true;
|
||||
lib_internal_modules = [];
|
||||
|
|
@ -7240,7 +7241,8 @@ let setup_t =
|
|||
};
|
||||
oasis_fn = Some "_oasis";
|
||||
oasis_version = "0.4.0";
|
||||
oasis_digest = Some "r\230rE\018\166\247N\216N\001k\0214\177\204";
|
||||
oasis_digest =
|
||||
Some "L\140\171S\252\246\195\233`\170D\014\135\031\000\162";
|
||||
oasis_exec = None;
|
||||
oasis_setup_args = [];
|
||||
setup_update = false
|
||||
|
|
@ -7248,6 +7250,6 @@ let setup_t =
|
|||
|
||||
let setup () = BaseSetup.setup setup_t;;
|
||||
|
||||
# 7252 "setup.ml"
|
||||
# 7254 "setup.ml"
|
||||
(* OASIS_STOP *)
|
||||
let () = setup ();;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue