feat(string): add optional cutoff arg on String.edit_distance

also add more tests
This commit is contained in:
Simon Cruanes 2020-04-30 22:19:49 -04:00
parent d99d35cc70
commit b26021a976
3 changed files with 45 additions and 10 deletions

View file

@ -589,18 +589,19 @@ let compare_natural a b =
then compare_natural a c < 0 else Q.assume_fail()) then compare_natural a c < 0 else Q.assume_fail())
*) *)
let edit_distance s1 s2 = let edit_distance ?(cutoff=max_int) s1 s2 =
if length s1 = 0 if length s1 = 0
then length s2 then min cutoff (length s2)
else if length s2 = 0 else if length s2 = 0
then length s1 then min cutoff (length s1)
else if equal s1 s2 else if equal s1 s2
then 0 then 0
else begin else try
(* distance vectors (v0=previous, v1=current) *) (* distance vectors (v0=previous, v1=current) *)
let v0 = Array.make (length s2 + 1) 0 in let v0 = Array.make (length s2 + 1) 0 in
let v1 = Array.make (length s2 + 1) 0 in let v1 = Array.make (length s2 + 1) 0 in
(* initialize v0: v0(i) = A(0)(i) = delete i chars from t *) (* initialize v0: v0(i) = A(0)(i) = delete i chars from t *)
let lower_bound = ref max_int in
for i = 0 to length s2 do for i = 0 to length s2 do
v0.(i) <- i v0.(i) <- i
done; done;
@ -611,19 +612,45 @@ let edit_distance s1 s2 =
(* try add/delete/replace operations *) (* try add/delete/replace operations *)
for j = 0 to length s2 - 1 do for j = 0 to length s2 - 1 do
let cost = if Char.compare (String.get s1 i) (String.get s2 j) = 0 then 0 else 1 in let cost = if Char.equal (String.get s1 i) (String.get s2 j) then 0 else 1 in
v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost)); v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost));
done; done;
if cutoff < Array.length v1 && i <= 2 * cutoff &&
2 * cutoff - i < String.length s2 then (
lower_bound := min !lower_bound v1.(2 * cutoff - i);
);
(* did we compute up to the diagonal 2*cutoff+1? *)
if cutoff < Array.length v1 && i = cutoff * 2 && !lower_bound >= cutoff then (
raise_notrace Exit;
);
(* copy v1 into v0 for next iteration *) (* copy v1 into v0 for next iteration *)
Array.blit v1 0 v0 0 (length s2 + 1); Array.blit v1 0 v0 0 (length s2 + 1);
done; done;
v1.(length s2) v1.(length s2)
end with Exit -> cutoff
(*$Q (*$Q
Q.(string_of_size Gen.(0 -- 30)) (fun s -> \ Q.(string_of_size Gen.(0 -- 30)) (fun s -> \
edit_distance s s = 0) edit_distance s s = 0)
Q.(let p = string_of_size Gen.(0 -- 20) in pair p p) (fun (s1,s2) -> \
edit_distance s1 s2 = edit_distance s2 s1)
Q.(let p = string_of_size Gen.(0 -- 20) in pair p p) (fun (s1,s2) -> \
let e = edit_distance s1 s2 in \
let e' = edit_distance ~cutoff:3 s1 s2 in \
(if e' < 3 then e=e' else e >= 3) && \
(if e <= 3 then e=e' else true))
*)
(*$= & ~printer:string_of_int
2 (edit_distance "hello" "helo!")
5 (edit_distance "abcde" "tuvwx")
2 (edit_distance ~cutoff:2 "abcde" "tuvwx")
1 (edit_distance ("a" ^ String.make 100 '_') ("b"^String.make 100 '_'))
1 (edit_distance ~cutoff:4 ("a" ^ String.make 1000 '_') ("b"^String.make 1000 '_'))
2 (edit_distance ~cutoff:3 ("a" ^ String.make 1000 '_' ^ "c")\
("b" ^ String.make 1000 '_' ^ "d"))
*) *)
(* test that building a from s, and mutating one char of s, yields (* test that building a from s, and mutating one char of s, yields

View file

@ -405,7 +405,11 @@ val compare_natural : string -> string -> int
https://en.wikipedia.org/wiki/Natural_sort_order https://en.wikipedia.org/wiki/Natural_sort_order
@since 1.3 *) @since 1.3 *)
val edit_distance : string -> string -> int val edit_distance : ?cutoff:int -> string -> string -> int
(** Edition distance between two strings. This satisfies the classical (** Edition distance between two strings. This satisfies the classical
distance axioms: it is always positive, symmetric, and satisfies distance axioms: it is always positive, symmetric, and satisfies
the formula [distance a b + distance b c >= distance a c]. *) the formula [distance a b + distance b c >= distance a c].
@param cutoff if provided, it's a cap on both the number of iterations,
and on the result. (since 3.0). This is useful if you just want to
check whether the edit distance is less or equal than 2 (use cutoff of 3).
*)

View file

@ -424,7 +424,11 @@ val compare_natural : string -> string -> int
https://en.wikipedia.org/wiki/Natural_sort_order https://en.wikipedia.org/wiki/Natural_sort_order
@since 1.3 *) @since 1.3 *)
val edit_distance : string -> string -> int val edit_distance : ?cutoff:int -> string -> string -> int
(** Edition distance between two strings. This satisfies the classical (** Edition distance between two strings. This satisfies the classical
distance axioms: it is always positive, symmetric, and satisfies distance axioms: it is always positive, symmetric, and satisfies
the formula [distance a b + distance b c >= distance a c]. *) the formula [distance a b + distance b c >= distance a c].
@param cutoff if provided, it's a cap on both the number of iterations,
and on the result. (since 3.0). This is useful if you just want to
check whether the edit distance is less or equal than 2 (use cutoff of 3).
*)