From bd7a9ce0709c09e53cf1986f6cf2e3b4363dd2fb Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Thu, 3 Nov 2016 20:27:26 +0100 Subject: [PATCH] add `CCString.edit_distance` --- src/core/CCString.cppo.ml | 32 ++++++++++++++++++++++++++++++++ src/core/CCString.mli | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/src/core/CCString.cppo.ml b/src/core/CCString.cppo.ml index 55018995..719de108 100644 --- a/src/core/CCString.cppo.ml +++ b/src/core/CCString.cppo.ml @@ -407,6 +407,38 @@ let compare_versions a b = in cmp_rec (Split.gen_cpy ~by:"." a) (Split.gen_cpy ~by:"." b) +let edit_distance s1 s2 = + if length s1 = 0 + then length s2 + else if length s2 = 0 + then length s1 + else if s1 = s2 + then 0 + else begin + (* two rows of the distance table, each with [length s2 + 1] entries (v0=previous row, v1=current row) *) + let v0 = Array.make (length s2 + 1) 0 in + let v1 = Array.make (length s2 + 1) 0 in + (* initialize v0: v0.(i) = i = distance between the empty prefix of s1 and the first i chars of s2 *) + for i = 0 to length s2 do + v0.(i) <- i + done; + (* main loop of the bottom-up dynamic programming algorithm: one row per char of s1 *) + for i = 0 to length s1 - 1 do + (* first entry of the row: deleting the first i+1 chars of s1 yields the empty string *) + v1.(0) <- i+1; + + (* keep the cheapest of insertion, deletion and replacement (replacement costs 0 if both chars are equal) *) + for j = 0 to length s2 - 1 do + let cost = if Char.compare (String.get s1 i) (String.get s2 j) = 0 then 0 else 1 in + v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost)); + done; + + (* copy v1 into v0 so that the current row becomes the previous row of the next iteration *) + Array.blit v1 0 v0 0 (length s2 + 1); + done; + v1.(length s2) + end + let repeat s n = assert (n>=0); let len = String.length s in diff --git a/src/core/CCString.mli b/src/core/CCString.mli index a7058e43..624eefce 100644 --- a/src/core/CCString.mli +++ b/src/core/CCString.mli @@ -517,6 +517,40 @@ val compare_versions : string -> string -> int *) +val edit_distance : string -> string -> int +(** Edit distance between two strings, counting single-char insertions, deletions and replacements. 
This satisfies the classical + distance axioms: it is never negative, symmetric, and satisfies + the triangle inequality [distance a b + distance b c >= distance a c] *) + +(*$Q + Q.(string_of_size Gen.(0 -- 30)) (fun s -> \ + edit_distance s s = 0) +*) + +(* test that mutating one char of s yields a string s' whose + edit distance to s is at most 1. + + --> generate triples (s, i, c) where c is a char, s a non-empty string + and i a valid index in s +*) + +(*$QR + ( + let gen = Q.Gen.( + 3 -- 10 >>= fun len -> + 0 -- (len-1) >>= fun i -> + string_size (return len) >>= fun s -> + char >|= fun c -> (s,i,c) + ) in + let small (s,_,_) = String.length s in + Q.make ~small gen + ) + (fun (s,i,c) -> + let s' = Bytes.of_string s in + Bytes.set s' i c; + edit_distance s (Bytes.to_string s') <= 1) +*) + (** {2 Slices} A contiguous part of a string *) module Sub : sig