add CCString.edit_distance

This commit is contained in:
Simon Cruanes 2016-11-03 20:27:26 +01:00
parent 72d43c6eeb
commit bd7a9ce070
2 changed files with 66 additions and 0 deletions

View file

@ -407,6 +407,38 @@ let compare_versions a b =
in
cmp_rec (Split.gen_cpy ~by:"." a) (Split.gen_cpy ~by:"." b)
(* [edit_distance s1 s2] computes the edit distance between [s1] and [s2],
   where inserting, deleting or replacing one character each costs 1.
   The table is filled bottom-up, one row per character of [s1], keeping
   only the previous and the current row in memory. *)
let edit_distance s1 s2 =
  let n1 = length s1 in
  let n2 = length s2 in
  if n1 = 0 then n2
  else if n2 = 0 then n1
  else if s1 = s2 then 0
  else begin
    (* [prev.(j)] starts as the cost of building the first [j] chars of
       [s2] from the empty string, i.e. [j] insertions *)
    let prev = Array.init (n2 + 1) (fun j -> j) in
    let cur = Array.make (n2 + 1) 0 in
    for i = 0 to n1 - 1 do
      let c1 = String.get s1 i in
      (* reducing the first [i+1] chars of [s1] to the empty string takes
         [i+1] deletions *)
      cur.(0) <- i + 1;
      for j = 0 to n2 - 1 do
        let replace =
          if Char.compare c1 (String.get s2 j) = 0 then prev.(j)
          else prev.(j) + 1
        in
        let insert = cur.(j) + 1 in
        let delete = prev.(j + 1) + 1 in
        cur.(j + 1) <- min insert (min delete replace)
      done;
      (* the row just computed becomes the previous row of the next round *)
      Array.blit cur 0 prev 0 (n2 + 1)
    done;
    (* after the final blit, [prev] holds the last computed row *)
    prev.(n2)
  end
let repeat s n =
assert (n>=0);
let len = String.length s in

View file

@ -517,6 +517,40 @@ val compare_versions : string -> string -> int
*)
val edit_distance : string -> string -> int
(** [edit_distance s1 s2] is the edit distance between [s1] and [s2]:
    the minimal number of single-character insertions, deletions and
    replacements needed to turn one string into the other.
    This satisfies the classical distance axioms: it is never negative,
    [edit_distance s s = 0], it is symmetric, and it satisfies the
    triangle inequality
    [edit_distance a b + edit_distance b c >= edit_distance a c] *)
(*$Q
Q.(string_of_size Gen.(0 -- 30)) (fun s -> \
edit_distance s s = 0)
*)
(* test that mutating one char of s yields a string s' whose edit
distance to s is at most 1.
--> generate triples (s, i, c) where c is a char, s a non empty string
and i a valid index in s
*)
(*$QR
(
let gen = Q.Gen.(
3 -- 10 >>= fun len ->
0 -- (len-1) >>= fun i ->
string_size (return len) >>= fun s ->
char >|= fun c -> (s,i,c)
) in
let small (s,_,_) = String.length s in
Q.make ~small gen
)
(fun (s,i,c) ->
let s' = Bytes.of_string s in
Bytes.set s' i c;
edit_distance s (Bytes.to_string s') <= 1)
*)
(** {2 Slices} A contiguous part of a string *)
module Sub : sig