refactor and clarify cutoff in String.edit_distance

This commit is contained in:
Simon Cruanes 2021-04-08 11:20:54 -04:00
parent 45b3956421
commit e75d93bb9d
2 changed files with 14 additions and 9 deletions

View file

@ -576,12 +576,12 @@ let compare_natural a b =
*) *)
let edit_distance ?(cutoff=max_int) s1 s2 = let edit_distance ?(cutoff=max_int) s1 s2 =
if length s1 = 0 let n1 = length s1 in
then min cutoff (length s2) let n2 = length s2 in
else if length s2 = 0 if n1 = 0 then min cutoff n2
then min cutoff (length s1) else if n2 = 0 then min cutoff n1
else if equal s1 s2 else if equal s1 s2 then 0
then 0 else if n1-n2 >= cutoff || n2-n1 >= cutoff then cutoff (* at least cutoff inserts *)
else try else try
(* distance vectors (v0=previous, v1=current) *) (* distance vectors (v0=previous, v1=current) *)
let v0 = Array.make (length s2 + 1) 0 in let v0 = Array.make (length s2 + 1) 0 in

View file

@ -462,9 +462,14 @@ val edit_distance : ?cutoff:int -> string -> string -> int
(** [edit_distance ~cutoff s1 s2] is the edition distance between the two strings [s1] and [s2]. (** [edit_distance ~cutoff s1 s2] is the edition distance between the two strings [s1] and [s2].
This satisfies the classical distance axioms: it is always positive, symmetric, and satisfies This satisfies the classical distance axioms: it is always positive, symmetric, and satisfies
the formula [distance s1 s2 + distance s2 s3 >= distance s1 s3]. the formula [distance s1 s2 + distance s2 s3 >= distance s1 s3].
@param cutoff if provided, it's a cap on both the number of iterations,
and on the result. (since 3.0). This is useful if you just want to @param cutoff if provided, it's a cap on the number of iterations.
check whether the edit distance is less or equal than 2 (use cutoff of 3). (since 3.0). This is useful if you just want to
check whether the edit distance is less than or equal to 2 without
computing the full distance (use [edit_distance s1 s2 ~cutoff:3 <= 2]).
{b note} that contrary to what was previously documented here, the result can
still be higher than [cutoff] if it's reached in [<cutoff] iterations.
However if the result is [< cutoff] then it is accurate.
*) *)
(** {2 Infix operators} (** {2 Infix operators}