diff --git a/README.md b/README.md index 43610d5b..377b6852 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ structures comprise (some modules in `misc/`, some other in `core/`): - `CCKTree`, an abstract lazy tree structure (similar to what `CCKlist` is to lists) - small modules (basic types, utilities): - `CCInt` + - `CCString` (basic string operations) - `CCPair` (cartesian products) - `CCOpt` (options) - `CCFun` (function combinators) @@ -70,6 +71,12 @@ structures comprise (some modules in `misc/`, some other in `core/`): - `CCHash` (hashing combinators) - `CCError` (monadic error handling) +### String + +In the module `Containers_string`: +- `Levenshtein`: edit distance between two strings +- `KMP`: Knuth-Morris-Pratt substring algorithm + ### Misc - `PHashtbl`, a polymorphic hashtable (with open addressing) diff --git a/_oasis b/_oasis index 5f35da1d..df6756cb 100644 --- a/_oasis +++ b/_oasis @@ -47,13 +47,13 @@ Library "containers" CCMultiSet, CCBV, CCPrint, CCPersistentHashtbl, CCError, CCHeap, CCList, CCOpt, CCPair, CCFun, CCHash, CCKList, CCInt, CCBool, CCArray, CCBatch, CCOrd, - CCRandom, CCLinq, CCKTree, CCTrie + CCRandom, CCLinq, CCKTree, CCTrie, CCString FindlibName: containers Library "containers_string" Path: string Pack: true - Modules: KMP, CCString, Levenshtein + Modules: KMP, Levenshtein FindlibName: string FindlibParent: containers diff --git a/core/CCString.ml b/core/CCString.ml new file mode 100644 index 00000000..ff37b99d --- /dev/null +++ b/core/CCString.ml @@ -0,0 +1,265 @@ + +(* +copyright (c) 2013-2014, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. 
redistributions in binary
+form must reproduce the above copyright notice, this list of conditions and the
+following disclaimer in the documentation and/or other materials provided with
+the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*)
+
+(** {1 Basic String Utils} *)
+
+type 'a gen = unit -> 'a option
+type 'a sequence = ('a -> unit) -> unit
+type 'a klist = unit -> [`Nil | `Cons of 'a * 'a klist]
+
+(** Operations shared by plain strings and by slices ({!Sub}). *)
+module type S = sig
+  type t
+
+  val length : t -> int
+
+  val blit : t -> int -> t -> int -> int -> unit
+  (** See {!String.blit} *)
+
+  (** {2 Conversions} *)
+
+  val to_gen : t -> char gen
+
+  val to_seq : t -> char sequence
+
+  val to_klist : t -> char klist
+
+  val pp : Buffer.t -> t -> unit
+end
+
+type t = string
+
+(** Equality of contents. The annotation specialises the comparison to
+    strings instead of going through fully polymorphic equality. *)
+let equal (a:string) b = a=b
+
+let compare = String.compare
+
+(** Hash of the string, computed with {!Hashtbl.hash}. *)
+let hash s = Hashtbl.hash s
+
+let length = String.length
+
+(* [_is_sub ~sub i s j ~len] tests whether [sub.[i .. i+len-1]] and
+   [s.[j .. j+len-1]] hold the same characters. It answers [false] when the
+   window would run past the end of [s]; the window inside [sub] is NOT
+   checked here, callers must guarantee it. *)
+let _is_sub ~sub i s j ~len =
+  let rec check k =
+    if k = len
+    then true
+    else sub.[i + k] = s.[j+k] && check (k+1)
+  in
+  j+len <= String.length s && check 0
+
+(** [is_sub ~sub i s j ~len] is [true] iff the [len] characters of [sub]
+    starting at [i] are equal to the [len] characters of [s] starting at [j].
+    @raise Invalid_argument if [i] or [len] is negative, or if
+    [i+len > String.length sub]. A negative [j] is reported by the bounds
+    check of the string access itself. *)
+let is_sub ~sub i s j ~len =
+  if i<0 || len<0 || i+len > String.length sub
+    then invalid_arg "String.is_sub";
+  _is_sub ~sub i s j ~len
+
+
+module Split = struct
+  (* Where the next token starts, or [SplitStop] once the final token has
+     been produced. *)
+  type split_state =
+    | SplitStop
+    | SplitAt of int (* previous *)
+
+  (* [by_j... prefix of s_i...] ? *)
+  let rec _is_prefix ~by s i j =
+    j = String.length by
+    ||
+    ( i < String.length s &&
+      s.[i] = by.[j] &&
+      _is_prefix ~by s (i+1) (j+1)
+    )
+
+  (* One step of the splitter: returns the next state together with the
+     offset and length of the token that was just delimited, or [None] when
+     the input is exhausted. *)
+  let rec _split ~by s state = match state with
+    | SplitStop -> None
+    | SplitAt prev -> _split_search ~by s prev prev
+  and _split_search ~by s prev i =
+    if i >= String.length s
+      then Some (SplitStop, prev, String.length s - prev)
+    (* A non-empty separator may match right at [prev]: this is what yields
+       the empty token between two adjacent separators (see the inline test
+       below). The empty separator matches everywhere, so for it alone the
+       match must lie strictly after [prev], otherwise the state would never
+       advance. *)
+    else if _is_prefix ~by s i 0 && (i>prev || by <> "")
+      then Some (SplitAt (i+String.length by), prev, i-prev)
+    else _split_search ~by s prev (i+1)
+
+  let _tuple3 x y z = x,y,z
+
+  (* Generator of tokens; [k s i len] builds each element. *)
+  let _mkgen ~by s k =
+    let state = ref (SplitAt 0) in
+    fun () ->
+      match _split ~by s !state with
+        | None -> None
+        | Some (state', i, len) ->
+            state := state';
+            Some (k s i len)
+
+  let gen ~by s = _mkgen ~by s _tuple3
+
+  let gen_cpy ~by s = _mkgen ~by s String.sub
+
+  (* Strict list of tokens, built with an accumulator (tail-recursive). *)
+  let _mklist ~by s k =
+    let rec build acc state = match _split ~by s state with
+      | None -> List.rev acc
+      | Some (state', i, len) ->
+          build (k s i len ::acc) state'
+    in
+    build [] (SplitAt 0)
+
+  let list_ ~by s = _mklist ~by s _tuple3
+
+  let list_cpy ~by s = _mklist ~by s String.sub
+
+  (* Lazy list of tokens: each cell is computed when it is forced. *)
+  let _mkklist ~by s k =
+    let rec make state () = match _split ~by s state with
+      | None -> `Nil
+      | Some (state', i, len) ->
+          `Cons (k s i len , make state')
+    in make (SplitAt 0)
+
+  let klist ~by s = _mkklist ~by s _tuple3
+
+  let klist_cpy ~by s = _mkklist ~by s String.sub
+
+  (* Iterator over tokens: [f] builds each element, [k] consumes it. *)
+  let _mkseq ~by s f k =
+    let rec aux state = match _split ~by s state with
+      | None -> ()
+      | Some (state', i, len) -> k (f s i len); aux state'
+    in aux (SplitAt 0)
+
+  let seq ~by s = _mkseq ~by s _tuple3
+  let seq_cpy ~by s = _mkseq ~by s String.sub
+
+  (*$T
+    Split.list_cpy ~by:"," "aa,bb,cc" = ["aa"; "bb"; "cc"]
+    Split.list_cpy ~by:"--" "a--b----c--" = ["a"; "b"; ""; "c"; ""]
+  *)
+end
+
+(** [find ?start ~sub s] is the smallest index [i >= start] at which [sub]
+    occurs in [s], or [-1] if there is none. [start] defaults to [0].
+    note: inefficient (every candidate position is compared naively). *)
+let find ?(start=0) ~sub s =
+  let n = String.length sub in
+  let i = ref start in
+  try
+    (* [<=], not [<]: the last candidate is the one where [sub] ends exactly
+       on the last character of [s]. *)
+    while !i + n <= String.length s do
+      if _is_sub ~sub 0 s !i ~len:n then raise Exit;
+      incr i
+    done;
+    -1
+  with Exit ->
+    !i
+
+(** [repeat s n] is [s] concatenated [n] times. Asserts that [n >= 0] and
+    that [s] is not empty. *)
+let repeat s n =
+  assert (n>=0);
+  let len = String.length s in
+  assert(len > 0);
+  let buf = String.create (len * n) in
+  for i = 0 to n-1 do
+    String.blit s 0 buf (i * len) len;
+  done;
+  buf
+
+(** [prefix ~pre s] is [true] iff [s] starts with [pre]. *)
+let prefix ~pre s =
+  String.length pre <= String.length s &&
+  (let i = ref 0 in
+    while !i < String.length pre && s.[!i] = pre.[!i] do incr i done;
+    !i = String.length pre)
+
+let blit = String.blit
+
+(* Generator over the [len] characters of [s] starting at [i0]. Characters
+   are read with [unsafe_get], so the caller must pass a valid range. *)
+let _to_gen s i0 len =
+  let i = ref i0 in
+  fun () ->
+    if !i = i0+len then None
+    else (
+      let c = String.unsafe_get s !i in
+      incr i;
+      Some c
+    )
+
+let to_gen s = _to_gen s 0 (String.length s)
+
+(** Build a string from all the characters produced by the generator. *)
+let of_gen g =
+  let b = Buffer.create 32 in
+  let rec aux () = match g () with
+    | None -> Buffer.contents b
+    | Some c -> Buffer.add_char b c; aux ()
+  in aux ()
+
+let to_seq s k = String.iter k s
+
+(** Build a string from all the characters the iterator passes to its
+    callback. *)
+let of_seq seq =
+  let b= Buffer.create 32 in
+  seq (Buffer.add_char b);
+  Buffer.contents b
+
+(* Lazy list over the [len] characters of [s] starting at [i]. *)
+let rec _to_klist s i len () =
+  if len=0 then `Nil
+  else `Cons (s.[i], _to_klist s (i+1)(len-1))
+
+(** Build a string from a lazy list of characters. The list is first
+    traversed to count and collect the characters (in reverse order), then
+    the result is filled from its last index down to the first. *)
+let of_klist l =
+  let rec aux acc n l = match l() with
+    | `Nil ->
+        let s = String.create n in
+        let acc = ref acc in
+        for i=n-1 downto 0 do
+          s.[i] <- List.hd !acc;
+          acc := List.tl !acc
+        done;
+        s
+    | `Cons (x,l') -> aux (x::acc) (n+1) l'
+  in aux [] 0 l
+
+let to_klist s = _to_klist s 0 (String.length s)
+
+(** Append the string to the buffer, between double quotes. The content is
+    written verbatim, without escaping. *)
+let pp buf s =
+  Buffer.add_char buf '"';
+  Buffer.add_string buf s;
+  Buffer.add_char buf '"'
+
+(** Slices: a string, an offset into it, and a length. No copy is made until
+    [copy] is called. *)
+module Sub = struct
+  type t = string * int * int
+
+  (** [make s i ~len] is the slice of [s] of length [len] starting at [i].
+      @raise Invalid_argument if the range does not fit in [s]. *)
+  let make s i ~len =
+    if i<0||len<0||i+len > String.length s then invalid_arg "CCString.Sub.make";
+    s,i,len
+
+  let full s = s, 0, String.length s
+
+  let copy (s,i,len) = String.sub s i len
+
+  let underlying (s,_,_) = s
+
+  (** [sub slice i' len'] is the sub-slice of length [len'] that starts [i']
+      characters into [slice].
+      @raise Invalid_argument if [i'] or [len'] is negative, or if the
+      sub-slice would extend past the end of [slice]. *)
+  let sub (s,i,len) i' len' =
+    if i'<0 || len'<0 || i' + len' > len then invalid_arg "CCString.Sub.sub";
+    (s, i+i',len')
+
+  let length (_,_,l) = l
+
+  (** Same as {!String.blit}, with offsets [o1] and [o2] relative to the
+      start of each slice.
+      @raise Invalid_argument if an offset or [len] is negative, or if
+      either range leaves its slice. *)
+  let blit (a1,i1,len1) o1 (a2,i2,len2) o2 len =
+    if o1<0 || o2<0 || len<0 || o1+len>len1 || o2+len>len2
+      then invalid_arg "CCString.Sub.blit";
+    String.blit a1 (i1+o1) a2 (i2+o2) len
+
+  let to_gen (s,i,len) = _to_gen s i len
+  let to_seq (s,i,len) k =
+    for i=i to i+len-1 do k s.[i] done
+  let to_klist (s,i,len) = _to_klist s i len
+
+  (** Append the characters of the slice to the buffer, between double
+      quotes. *)
+  let pp buf (s,i,len) =
+    Buffer.add_char buf '"';
+    Buffer.add_substring buf s i len;
+    Buffer.add_char buf '"'
+end
diff --git a/core/CCString.mli b/core/CCString.mli
new file mode 100644
index 00000000..65997b77
--- /dev/null
+++ b/core/CCString.mli
@@ -0,0 +1,141 @@
+
+(*
+copyright (c) 2013-2014, simon cruanes
+all rights reserved.
+
+redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimer. redistributions in binary
+form must reproduce the above copyright notice, this list of conditions and the
+following disclaimer in the documentation and/or other materials provided with
+the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*)
+
+(** {1 Basic String Utils}
+Consider using {!Containers_string.KMP} for pattern search, or Regex
+libraries.
*)
+
+(** Generator: each call yields the next element, [None] once exhausted. *)
+type 'a gen = unit -> 'a option
+
+(** Iterator: calls the given function on every element. *)
+type 'a sequence = ('a -> unit) -> unit
+
+(** Lazy list: forcing it yields either [`Nil] or a head and a lazy tail. *)
+type 'a klist = unit -> [`Nil | `Cons of 'a * 'a klist]
+
+(** {2 Common Signature} *)
+
+module type S = sig
+  type t
+
+  val length : t -> int
+
+  val blit : t -> int -> t -> int -> int -> unit
+  (** See {!String.blit} *)
+
+  (** {2 Conversions} *)
+
+  val to_gen : t -> char gen
+
+  val to_seq : t -> char sequence
+
+  val to_klist : t -> char klist
+
+  val pp : Buffer.t -> t -> unit
+end
+
+(** {2 Strings} *)
+
+type t = string
+
+val equal : t -> t -> bool
+
+val compare : t -> t -> int
+
+val hash : t -> int
+
+val of_gen : char gen -> t
+(** String made of all the characters produced by the generator *)
+
+val of_seq : char sequence -> t
+(** String made of all the characters produced by the iterator *)
+
+val of_klist : char klist -> t
+(** String made of all the characters of the lazy list *)
+
+val find : ?start:int -> sub:t -> t -> int
+(** Find [sub] in the string, returns its first index or -1.
+    The search begins at index [start] (default [0]).
+    Should only be used with very small [sub] *)
+
+val is_sub : sub:t -> int -> t -> int -> len:int -> bool
+(** [is_sub ~sub i s j ~len] returns [true] iff the substring of
+    [sub] starting at position [i] and of length [len],
+    is a substring of [s] starting at position [j]
+    @raise Invalid_argument if [i+len > String.length sub] *)
+
+val repeat : t -> int -> t
+(** The same string, repeated n times. The implementation asserts that
+    [n >= 0] and that the string is not empty *)
+
+val prefix : pre:t -> t -> bool
+(** [prefix ~pre s] returns [true] iff [pre] is a prefix of [s] *)
+
+include S with type t := t
+
+(** {2 Splitting} *)
+
+module Split : sig
+  val list_ : by:t -> t -> (t*int*int) list
+  (** split the given string along the given separator [by]. Should only
+      be used with very small separators, otherwise
+      use {!Containers_string.KMP}.
+      @return a list of triples [(s, index, length)], one per token, where
+        [s] is the string being split and [index], [length] locate the
+        token inside it. {!String.sub} can then be used to actually extract
+        the slice.
+
+      NOTE(review): this comment used to state that [Failure] is raised
+      when [by = ""], but the implementation in core/CCString.ml performs no
+      such check; confirm the intended behaviour for an empty separator. *)
+
+  val gen : by:t -> t -> (t*int*int) gen
+  (** Same tokens as {!list_}, produced one at a time by a generator *)
+
+  val seq : by:t -> t -> (t*int*int) sequence
+  (** Same tokens as {!list_}, passed one at a time to a callback *)
+
+  val klist : by:t -> t -> (t*int*int) klist
+  (** Same tokens as {!list_}, as a lazy list *)
+
+  (** {6 Copying functions}
+
+  Those split functions actually copy the substrings, which can be
+  more convenient but less efficient in general *)
+
+  val list_cpy : by:t -> t -> t list
+
+  val gen_cpy : by:t -> t -> t gen
+
+  val seq_cpy : by:t -> t -> t sequence
+
+  val klist_cpy : by:t -> t -> t klist
+end
+
+(** {2 Slices} A contiguous part of a string *)
+
+module Sub : sig
+  type t = string * int * int
+  (** A string, an offset, and the length of the slice *)
+
+  val make : string -> int -> len:int -> t
+  (** [make s i ~len] is the slice of [s] starting at offset [i] and of
+      length [len].
+      @raise Invalid_argument if [i < 0], [len < 0] or
+      [i+len > String.length s] *)
+
+  val full : string -> t
+  (** Full string *)
+
+  val copy : t -> string
+  (** Make a copy of the substring *)
+
+  val underlying : t -> string
+  (** The string the slice points into *)
+
+  val sub : t -> int -> int -> t
+  (** Sub-slice: [sub slice i len] starts [i] characters into [slice] and
+      has length [len].
+      @raise Invalid_argument if the sub-slice extends past the end of
+      [slice] *)
+
+  include S with type t := t
+end
diff --git a/string/CCString.ml b/string/CCString.ml
deleted file mode 100644
index a34614cf..00000000
--- a/string/CCString.ml
+++ /dev/null
@@ -1,150 +0,0 @@
-
-(*
-copyright (c) 2013-2014, simon cruanes
-all rights reserved.
-
-redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer. redistributions in binary
-form must reproduce the above copyright notice, this list of conditions and the
-following disclaimer in the documentation and/or other materials provided with
-the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -(** {1 Basic String Utils} *) - -type t = string - -let equal a b = a=b - -let compare = String.compare - -let hash s = Hashtbl.hash s - -type 'a gen = unit -> 'a option -type 'a sequence = ('a -> unit) -> unit - -let is_sub ~sub i s j = - let rec check k = - if i + k = String.length sub - then true - else sub.[i + k] = s.[j+k] && check (k+1) - in - check 0 - -(* note: quite inefficient if [by] is long *) -let split_gen ~by s = - let len_by = String.length by in - assert (len_by > 0); - let n = String.length s in - let prev = ref 0 in - let stop = ref false in - let rec search i = - if !stop then None - else if i >= n - then ( - stop := true; - Some (String.sub s !prev (n- !prev)) (* done *) - ) - else if is_prefix i 0 - then ( - let p = !prev in - prev := i+len_by; - Some (String.sub s p (i-p)) - ) - else search (i+1) - and is_prefix i j = - if j = len_by - then true - else if i = n - then false - else s.[i] = by.[j] && is_prefix (i+1) (j+1) - in - fun () -> - search !prev - -let split_seq ~by s k = - let rec aux g = match g () with - | None -> () - | Some x -> k x; aux g - in aux (split_gen ~by s) - -let split ~by s = - let rec aux g acc = match g () with - | None -> List.rev acc - | Some x -> aux g (x::acc) - in aux (split_gen ~by s) [] - -(*$T - split ~by:"," "aa,bb,cc" = ["aa"; "bb"; "cc"] - split ~by:"--" "a--b----c--" = ["a"; "b"; ""; "c"; ""] -*) - -(* note: inefficient *) -let find ?(start=0) ~sub s = - let n = String.length sub in - let 
i = ref start in - try - while !i + n < String.length s do - if is_sub ~sub 0 s !i then raise Exit; - incr i - done; - -1 - with Exit -> - !i - -let repeat s n = - assert (n>=0); - let len = String.length s in - assert(len > 0); - let buf = String.create (len * n) in - for i = 0 to n-1 do - String.blit s 0 buf (i * len) len; - done; - buf - -let prefix ~pre s = - String.length pre <= String.length s && - (let i = ref 0 in - while !i < String.length pre && s.[!i] = pre.[!i] do incr i done; - !i = String.length pre) - - -let to_gen s = - let i = ref 0 in - fun () -> - if !i = String.length s then None - else ( - let c = String.unsafe_get s !i in - incr i; - Some c - ) - -let of_gen g = - let b = Buffer.create 32 in - let rec aux () = match g () with - | None -> Buffer.contents b - | Some c -> Buffer.add_char b c; aux () - in aux () - -let to_seq s k = String.iter k s - -let of_seq seq = - let b= Buffer.create 32 in - seq (Buffer.add_char b); - Buffer.contents b - -let pp = Buffer.add_string diff --git a/string/CCString.mli b/string/CCString.mli deleted file mode 100644 index 50ba1bcf..00000000 --- a/string/CCString.mli +++ /dev/null @@ -1,70 +0,0 @@ - -(* -copyright (c) 2013-2014, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -(** {1 Basic String Utils} -Consider using KMP instead. *) - -type t = string - -val equal : t -> t -> bool - -val compare : t -> t -> int - -val hash : t -> int - -type 'a gen = unit -> 'a option -type 'a sequence = ('a -> unit) -> unit - -val is_sub : sub:t -> int -> t -> int -> bool -(** [is_sub ~sub i s j] returns [true] iff [sub] is a substring of [s] starting - at position [j] *) - -val split : by:t -> t -> t list -(** split the given string along the given separator [by]. Should only - be used with very small separators, otherwise use {!KMP}. - @raise Failure if [by = ""] *) - -val split_gen : by:t -> t -> t gen - -val split_seq : by:t -> t -> t sequence - -val find : ?start:int -> sub:t -> t -> int -(** Find [sub] in the string, returns its first index or -1. - Should only be used with very small [sub] *) - -val repeat : t -> int -> t -(** The same string, repeated n times *) - -val prefix : pre:t -> t -> bool -(** [str_prefix ~pre s] returns [true] iff [pre] is a prefix of [s] *) - -val to_gen : t -> char gen -val of_gen : char gen -> t - -val to_seq : t -> char sequence -val of_seq : char sequence -> t - -val pp : Buffer.t -> t -> unit