From e75d93bb9d0fbe3415baa3e53bb87ebf03fe2066 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Thu, 8 Apr 2021 11:20:54 -0400 Subject: [PATCH] refactor and clarify `cutoff` in `String.edit_distance` --- src/core/CCString.ml | 12 ++++++------ src/core/CCString.mli | 11 ++++++++--- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/core/CCString.ml b/src/core/CCString.ml index 8c276c2c..add7afd4 100644 --- a/src/core/CCString.ml +++ b/src/core/CCString.ml @@ -576,12 +576,12 @@ let compare_natural a b = *) let edit_distance ?(cutoff=max_int) s1 s2 = - if length s1 = 0 - then min cutoff (length s2) - else if length s2 = 0 - then min cutoff (length s1) - else if equal s1 s2 - then 0 + let n1 = length s1 in + let n2 = length s2 in + if n1 = 0 then min cutoff n2 + else if n2 = 0 then min cutoff n1 + else if equal s1 s2 then 0 + else if n1-n2 >= cutoff || n2-n1 >= cutoff then cutoff (* at least cutoff inserts *) else try (* distance vectors (v0=previous, v1=current) *) let v0 = Array.make (length s2 + 1) 0 in diff --git a/src/core/CCString.mli b/src/core/CCString.mli index 2f0b9c0d..d50a4f68 100644 --- a/src/core/CCString.mli +++ b/src/core/CCString.mli @@ -462,9 +462,14 @@ val edit_distance : ?cutoff:int -> string -> string -> int (** [edit_distance ~cutoff s1 s2] is the edition distance between the two strings [s1] and [s2]. This satisfies the classical distance axioms: it is always positive, symmetric, and satisfies the formula [distance s1 s2 + distance s2 s3 >= distance s1 s3]. - @param cutoff if provided, it's a cap on both the number of iterations, - and on the result. (since 3.0). This is useful if you just want to - check whether the edit distance is less or equal than 2 (use cutoff of 3). + + @param cutoff if provided, it's a cap on the number of iterations. + (since 3.0). This is useful if you just want to + check whether the edit distance is less than or equal to 2 without + computing the full distance (use [edit_distance s1 s2 ~cutoff:3 <= 2]). 
+ {b note} that contrary to what was previously documented here, the result can + still be higher than [cutoff] if it's reached in [