mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 19:25:28 -05:00
bugfix in the translation NDA -> DFA in Levenshtein
This commit is contained in:
parent
b6310ae17d
commit
054a83abfb
2 changed files with 66 additions and 17 deletions
|
|
@ -51,7 +51,16 @@ module type S = sig
|
||||||
type char_
|
type char_
|
||||||
type string_
|
type string_
|
||||||
|
|
||||||
(** {6 Automaton} *)
|
(** {6 Edit Distance} *)
|
||||||
|
|
||||||
|
val edit_distance : string_ -> string_ -> int
|
||||||
|
(** Edition distance between two strings. This satisfies the classical
|
||||||
|
distance axioms: it is always positive, symmetric, and satisfies
|
||||||
|
the formula [distance a b + distance b c >= distance a c] *)
|
||||||
|
|
||||||
|
(** {6 Automaton}
|
||||||
|
An automaton, built from a string [s] and a limit [n], that accepts
|
||||||
|
every string that is at distance at most [n] from [s]. *)
|
||||||
|
|
||||||
type automaton
|
type automaton
|
||||||
(** Levenshtein automaton *)
|
(** Levenshtein automaton *)
|
||||||
|
|
@ -105,6 +114,38 @@ module Make(Str : STRING) = struct
|
||||||
type string_ = Str.t
|
type string_ = Str.t
|
||||||
type char_ = Str.char_
|
type char_ = Str.char_
|
||||||
|
|
||||||
|
let edit_distance s1 s2 =
|
||||||
|
if Str.length s1 = 0
|
||||||
|
then Str.length s2
|
||||||
|
else if Str.length s2 = 0
|
||||||
|
then Str.length s1
|
||||||
|
else if s1 = s2
|
||||||
|
then 0
|
||||||
|
else begin
|
||||||
|
(* distance vectors (v0=previous, v1=current) *)
|
||||||
|
let v0 = Array.make (Str.length s2 + 1) 0 in
|
||||||
|
let v1 = Array.make (Str.length s2 + 1) 0 in
|
||||||
|
(* initialize v0: v0(i) = A(0)(i) = delete i chars from t *)
|
||||||
|
for i = 0 to Str.length s2 do
|
||||||
|
v0.(i) <- i
|
||||||
|
done;
|
||||||
|
(* main loop for the bottom up dynamic algorithm *)
|
||||||
|
for i = 0 to Str.length s1 - 1 do
|
||||||
|
(* first edit distance is the deletion of i+1 elements from s *)
|
||||||
|
v1.(0) <- i+1;
|
||||||
|
|
||||||
|
(* try add/delete/replace operations *)
|
||||||
|
for j = 0 to Str.length s2 - 1 do
|
||||||
|
let cost = if Str.compare_char (Str.get s1 i) (Str.get s2 j) = 0 then 0 else 1 in
|
||||||
|
v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost));
|
||||||
|
done;
|
||||||
|
|
||||||
|
(* copy v1 into v0 for next iteration *)
|
||||||
|
Array.blit v1 0 v0 0 (Str.length s2 + 1);
|
||||||
|
done;
|
||||||
|
v1.(Str.length s2)
|
||||||
|
end
|
||||||
|
|
||||||
module NDA = struct
|
module NDA = struct
|
||||||
type char =
|
type char =
|
||||||
| Any
|
| Any
|
||||||
|
|
@ -305,9 +346,7 @@ module Make(Str : STRING) = struct
|
||||||
get_transition_for_char nda c acc transitions
|
get_transition_for_char nda c acc transitions
|
||||||
) set NDAStateSet.empty
|
) set NDAStateSet.empty
|
||||||
in
|
in
|
||||||
let set' = saturate_epsilon nda set' in
|
saturate_epsilon nda set'
|
||||||
let is_final = NDAStateSet.exists (NDA.is_final nda) set' in
|
|
||||||
set', is_final
|
|
||||||
|
|
||||||
let follow_transition_any nda set =
|
let follow_transition_any nda set =
|
||||||
let set' = NDAStateSet.fold
|
let set' = NDAStateSet.fold
|
||||||
|
|
@ -317,9 +356,7 @@ module Make(Str : STRING) = struct
|
||||||
get_transitions_for_any nda acc transitions
|
get_transitions_for_any nda acc transitions
|
||||||
) set NDAStateSet.empty
|
) set NDAStateSet.empty
|
||||||
in
|
in
|
||||||
let set' = saturate_epsilon nda set' in
|
saturate_epsilon nda set'
|
||||||
let is_final = NDAStateSet.exists (NDA.is_final nda) set' in
|
|
||||||
set', is_final
|
|
||||||
|
|
||||||
(* call [k] with every [transition'] that can be reached from [set], with
|
(* call [k] with every [transition'] that can be reached from [set], with
|
||||||
a bool that states whether it's final *)
|
a bool that states whether it's final *)
|
||||||
|
|
@ -331,15 +368,15 @@ module Make(Str : STRING) = struct
|
||||||
(fun c ->
|
(fun c ->
|
||||||
(*Printf.printf "iterate_transition follows %c (at %s)\n"
|
(*Printf.printf "iterate_transition follows %c (at %s)\n"
|
||||||
(Obj.magic c) (set_to_string set);*)
|
(Obj.magic c) (set_to_string set);*)
|
||||||
let set', is_final = follow_transition nda set c in
|
let set' = follow_transition nda set c in
|
||||||
if not (NDAStateSet.is_empty set')
|
if not (NDAStateSet.is_empty set')
|
||||||
then k ~is_final (NDA.Char c) set';
|
then k (NDA.Char c) set';
|
||||||
) chars;
|
) chars;
|
||||||
(* remaining transitions, with only "Any" *)
|
(* remaining transitions, with only "Any" *)
|
||||||
(*Printf.printf "iterate transition follows * (at %s)\n" (set_to_string set);*)
|
(*Printf.printf "iterate transition follows * (at %s)\n" (set_to_string set);*)
|
||||||
let set', is_final = follow_transition_any nda set in
|
let set' = follow_transition_any nda set in
|
||||||
if not (NDAStateSet.is_empty set')
|
if not (NDAStateSet.is_empty set')
|
||||||
then k ~is_final NDA.Any set'
|
then k NDA.Any set'
|
||||||
|
|
||||||
module StateSetMap = Map.Make(NDAStateSet)
|
module StateSetMap = Map.Make(NDAStateSet)
|
||||||
|
|
||||||
|
|
@ -355,15 +392,16 @@ module Make(Str : STRING) = struct
|
||||||
(* traverse the NDA. Currently we're at [set] *)
|
(* traverse the NDA. Currently we're at [set] *)
|
||||||
let rec traverse nda dfa states set =
|
let rec traverse nda dfa states set =
|
||||||
let set_i = get_state dfa states set in
|
let set_i = get_state dfa states set in
|
||||||
|
(* does this set lead to success? *)
|
||||||
|
let is_final = NDAStateSet.exists (NDA.is_final nda) set in
|
||||||
|
if is_final
|
||||||
|
then set_final dfa set_i;
|
||||||
iterate_transition_set nda set
|
iterate_transition_set nda set
|
||||||
(fun ~is_final c set' ->
|
(fun c set' ->
|
||||||
(*Printf.printf "traverse %s --%c--> %s\n" (set_to_string set)
|
(*Printf.printf "traverse %s --%c--> %s\n" (set_to_string set)
|
||||||
(match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*')
|
(match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*')
|
||||||
(set_to_string set');*)
|
(set_to_string set');*)
|
||||||
let set_i' = get_state dfa states set' in
|
let set_i' = get_state dfa states set' in
|
||||||
(* did we reach success? *)
|
|
||||||
if is_final
|
|
||||||
then set_final dfa set_i';
|
|
||||||
(* link set -> set' *)
|
(* link set -> set' *)
|
||||||
match c with
|
match c with
|
||||||
| NDA.Char c' ->
|
| NDA.Char c' ->
|
||||||
|
|
|
||||||
|
|
@ -53,13 +53,24 @@ type 'a klist =
|
||||||
val klist_to_list : 'a klist -> 'a list
|
val klist_to_list : 'a klist -> 'a list
|
||||||
(** Helper. *)
|
(** Helper. *)
|
||||||
|
|
||||||
(** {2 Signature} *)
|
(** {2 Signature}
|
||||||
|
We abstract over the type of characters and strings, so that we
|
||||||
|
can deal with several encodings, string representations, etc. *)
|
||||||
|
|
||||||
module type S = sig
|
module type S = sig
|
||||||
type char_
|
type char_
|
||||||
type string_
|
type string_
|
||||||
|
|
||||||
(** {6 Automaton} *)
|
(** {6 Edit Distance} *)
|
||||||
|
|
||||||
|
val edit_distance : string_ -> string_ -> int
|
||||||
|
(** Edition distance between two strings. This satisfies the classical
|
||||||
|
distance axioms: it is always positive, symmetric, and satisfies
|
||||||
|
the formula [distance a b + distance b c >= distance a c] *)
|
||||||
|
|
||||||
|
(** {6 Automaton}
|
||||||
|
An automaton, built from a string [s] and a limit [n], that accepts
|
||||||
|
every string that is at distance at most [n] from [s]. *)
|
||||||
|
|
||||||
type automaton
|
type automaton
|
||||||
(** Levenshtein automaton *)
|
(** Levenshtein automaton *)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue