diff --git a/README.adoc b/README.adoc index be175195..7f69a87f 100644 --- a/README.adoc +++ b/README.adoc @@ -22,11 +22,6 @@ cross-module dependencies). containers.data:: with additional data structures that don't have an equivalent in the standard library; containers.iter:: with list-like and tree-like iterators; - containers.string:: (in directory `string`) with - a few packed modules that deal with strings (Levenshtein distance, - KMP search algorithm, and a few naive utils). Again, modules are independent - and sometimes parametric on the string and char types (so they should - be able to deal with your favorite unicode library). - A sub-library with complicated abstractions, `containers.advanced` (with a LINQ-like query module, batch operations using GADTs, and others). @@ -191,14 +186,6 @@ Iterators: - `CCKList`, a persistent iterator structure (akin to a lazy list, without memoization) - `CCKTree`, an abstract lazy tree structure -=== String - -See http://cedeela.fr/~simon/software/containers/Containers_string[doc]. - -In the module `Containers_string`: -- `Levenshtein`: edition distance between two strings -- `KMP`: Knuth-Morris-Pratt substring algorithm -- `Parse`: simple parser combinators === Thread diff --git a/_oasis b/_oasis index 4ae7b35f..31c2f04e 100644 --- a/_oasis +++ b/_oasis @@ -78,13 +78,6 @@ Library "containers_iter" FindlibParent: containers FindlibName: iter -Library "containers_string" - Path: src/string - Modules: Containers_string, CCLevenshtein, CCApp_parse - BuildDepends: bytes - FindlibName: string - FindlibParent: containers - Library "containers_thread" Path: src/threads/ Modules: CCPool, CCLock, CCSemaphore, CCThread, CCBlockingQueue, @@ -102,7 +95,6 @@ Library "containers_top" FindlibName: top FindlibParent: containers BuildDepends: compiler-libs.common, containers, containers.data, - containers.bigarray, containers.string, containers.unix, containers.sexp, containers.iter Document containers @@ -116,8 +108,7 @@ Document containers "-docflags '-colorize-code -short-functors -charset utf-8'" XOCamlbuildLibraries: containers, containers.iter, containers.data, - containers.string, containers.thread, - containers.unix, containers.sexp + containers.thread, containers.unix, containers.sexp Executable run_benchs Path: benchs/ @@ -126,7 +117,7 @@ Executable run_benchs Build$: flag(bench) MainIs: run_benchs.ml BuildDepends: containers, qcheck, - containers.data, containers.string, containers.iter, + containers.data, containers.iter, containers.thread, sequence, gen, benchmark, hamt Executable run_bench_hash @@ -145,9 +136,8 @@ Executable run_qtest CompiledObject: best MainIs: run_qtest.ml Build$: flag(tests) && flag(bigarray) && flag(unix) - BuildDepends: containers, containers.string, containers.iter, - containers.sexp, - containers.unix, containers.thread, + BuildDepends: containers, containers.iter, + containers.sexp, containers.unix, containers.thread, containers.data, sequence, gen, unix, oUnit, qcheck diff --git a/doc/intro.txt b/doc/intro.txt index 253dc5d9..a03d509d 100644 --- a/doc/intro.txt +++ b/doc/intro.txt @@ -120,17 +120,6 @@ CCKList CCKTree CCLazy_list} -{4 String} - -{b findlib name}: containers.string - -{!modules: -CCApp_parse -CCKMP -CCLevenshtein -CCParse -} - {4 Misc} Moved to its own repository. diff --git a/src/string/CCApp_parse.ml b/src/string/CCApp_parse.ml deleted file mode 100644 index 19fe4001..00000000 --- a/src/string/CCApp_parse.ml +++ /dev/null @@ -1,834 +0,0 @@ -(* -copyright (c) 2013-2015, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -(** {1 Applicative Parser Combinators} *) - -type ('a,'b) result = [`Error of 'b | `Ok of 'a] - -type multiplicity = - | Star (* 0 or more *) - | Plus (* 1 or more *) - | Question (* 0 or 1 *) - -let str fmt = Printf.sprintf fmt - -module CharSet = Set.Make(Char) -module CharMap = Map.Make(Char) - -let print_char = function - | '\t' -> "\\t" - | '\n' -> "\\n" - | '\r' -> "\\r" - | '"' -> "\\\"" - | c -> str "%c" c - -let print_char_set set = - let buf = Buffer.create 32 in - Buffer.add_char buf '"'; - CharSet.iter (fun c -> Buffer.add_string buf (print_char c)) set; - Buffer.add_char buf '"'; - Buffer.contents buf - -let domain_of_char_map m = - CharMap.fold (fun c _ set -> CharSet.add c set) m CharSet.empty - -let print_char_map map = - let l = CharMap.fold - (fun c _ acc -> print_char c :: acc) map [] in - String.concat ", " l - -let ppmap ?(sep=", ") pp_k pp_v fmt m = - let first = ref true in - CharMap.iter - (fun k v -> - if !first then first := false else Format.pp_print_string fmt sep; - pp_k fmt k; - Format.pp_print_string fmt " → "; - pp_v fmt v; - Format.pp_print_cut fmt () - ) m; - () - -let set_of_string s = - let set = ref CharSet.empty in - String.iter - (fun c -> - if CharSet.mem c !set - then invalid_arg (str "any_of: duplicate char %c" c); - set := CharSet.add c !set - ) s; - !set - -(* add [c -> p] to the map, for every [c] in [set] *) -let map_add_set init set p = - CharSet.fold - (fun c map -> CharMap.add c p map) - set init - -(* function composition *) -let compose f g x = f (g x) - -let str_of_l l = - let b = Bytes.make (List.length l) ' ' in - List.iteri (fun i c -> Bytes.set b i c) l; - Bytes.unsafe_to_string b - -type 'a t = { - mutable st : 'a parse_or_compiled; -} - -(* syntactic version *) -and _ parse = - | Many : 'a t * unit t * multiplicity -> 'a list parse - | Skip : 'a t * multiplicity -> unit parse (* same as Many, but ignores *) - | Lazy : 'a t lazy_t -> 'a parse - -(* compiled version *) -and _ compiled = - | C_Return : 'a -> 'a compiled - | C_Map : ('a -> 'b) * 'a t -> 'b compiled - | C_Filter: ('a -> bool) * 'a t -> 'a compiled - | C_App : ('a -> 'b) t * 'a t -> 'b compiled - | C_AppLeft : 'a t * 'b t -> 'a compiled - | C_AppRight : 'a t * 'b t -> 'b compiled - | C_Fail : string -> 'a compiled - | C_Int : int compiled - | C_Float : float compiled - | C_Junk : unit compiled (* ignore next char *) - | C_AnyOf : CharSet.t -> char compiled - | C_SwitchC : 'a t CharMap.t * 'a t option -> 'a compiled - | C_Eof : unit compiled - -and 'a parse_or_compiled = - | Parse of 'a parse - | Compiled of 'a compiled - -(** {2 Helpers} *) - -(* build a new parser *) -let make p = {st=Parse p} -let make_c c = {st=Compiled c} -let make_pc st = {st} - -let ppmult fmt = function - | Star -> Format.pp_print_string fmt "*" - | Plus -> Format.pp_print_string fmt "+" - | Question -> Format.pp_print_string fmt "?" - -let print fmt p = - let depth = ref 0 in - (* print up to a given limit into lazy values *) - let rec print_aux - : type a. Format.formatter -> a t -> unit - = fun fmt p -> - let ppstr = Format.pp_print_string - and ppf fmt x = Format.fprintf fmt x in - let ppc fmt c = ppf fmt "'%s'" (print_char c) in - match p.st with - | Compiled (C_Return _) -> ppstr fmt "" - | Compiled (C_Map (_, x)) -> ppf fmt "@[(map@ %a)@]" print_aux x - | Compiled (C_Filter (_, x)) -> ppf fmt "@[(filter@ %a)@]" print_aux x - | Compiled (C_App (f, x)) -> ppf fmt "@[<2>@[%a@]@ <*>@ @[%a@]@]" print_aux f print_aux x - | Compiled (C_AppLeft (a, b)) -> ppf fmt "@[%a@ <<@ %a@]" print_aux a print_aux b - | Compiled (C_AppRight (a, b)) -> ppf fmt "@[%a@ >>@ %a@]" print_aux a print_aux b - | Compiled (C_Fail _) -> ppf fmt "" - | Compiled C_Int -> ppstr fmt "" - | Compiled C_Float -> ppstr fmt "" - | Compiled C_Junk -> ppstr fmt "" - | Compiled (C_AnyOf set) -> ppf fmt "@[(any@ %s)@]" (print_char_set set) - | Parse (Many (p, sep, mult)) -> - ppf fmt "@[<2>(@[%a@]@ sep:@[%a@])%a@]" print_aux p print_aux sep ppmult mult - | Parse (Skip (p, mult)) -> - ppf fmt "@[<2>(skip @[%a@]%a)@]" print_aux p ppmult mult - | Compiled (C_SwitchC (map, None)) -> - ppf fmt "@[(switch@ @[%a@])@]" (ppmap ppc print_aux) map - | Compiled (C_SwitchC (map, Some o)) -> - ppf fmt "@[(switch@ @[%a@]@ or:%a)@]" (ppmap ppc print_aux) map print_aux o - | Parse (Lazy _) when !depth > 3 -> ppf fmt "" - | Parse (Lazy (lazy p)) -> - incr depth; - print_aux fmt p; - decr depth - | Compiled C_Eof -> ppstr fmt "" - in - print_aux fmt p - -let int_first_char = lazy (set_of_string "-0123456789") -let float_first_char = lazy (set_of_string ".-0123456789") - -(* a set of characters that are valid as first characters of a parser *) -type possible_first_chars = - | Set of CharSet.t - | AllChars - | NoChar - | NoCharOrSet of CharSet.t (* either no char, or something starting with set *) - | IsFail of string - -let ret_set set = match CharSet.cardinal set with - | 0 -> NoChar - | 256 -> AllChars - | _ -> Set set - -let ret_no_char_or set = match CharSet.cardinal set with - | 0 -> NoChar - | 256 -> AllChars - | _ -> NoCharOrSet set - -(* pfc of parsing a or b *) -let union_pfc a b = match a, b with - | Set a, Set b -> ret_set (CharSet.union a b) - | NoCharOrSet s, Set s' - | Set s', NoCharOrSet s -> ret_no_char_or (CharSet.union s s') - | NoChar, Set s - | Set s, NoChar -> ret_no_char_or s - | NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s') - | IsFail e, _ | _, IsFail e -> IsFail e - | AllChars, _ | _, AllChars -> AllChars - | NoChar, o | o, NoChar -> o - -(* pfc of parsing a then b *) -let then_pfc a b = match a, b with - | Set a, Set b -> ret_set (CharSet.union a b) - | NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s') - | NoCharOrSet s, Set s' -> ret_set (CharSet.union s s') - | NoCharOrSet s, NoChar -> ret_no_char_or s - | Set s, _ -> ret_set s - | IsFail e, _ | _, IsFail e -> IsFail e - | AllChars, _ | _, AllChars -> AllChars - | NoChar, o -> o - -let (<|||>) a b = match a with - | NoChar -> Lazy.force b - | NoCharOrSet _ -> then_pfc a (Lazy.force b) - | _ -> a - -(* set of possibilities for the first char of a parser *) -let rec pfc : type a. a t -> possible_first_chars = fun t -> pfc_pc t.st - -and pfc_pc - : type a. a parse_or_compiled -> possible_first_chars - = function - | Parse p -> pfc_p p - | Compiled c -> pfc_c c - -and pfc_p - : type a. a parse -> possible_first_chars - = function - | Many (p, _, (Question | Star)) -> union_pfc (pfc p) NoChar - | Many (p, _, Plus) -> pfc p - | Skip (p, (Question | Star)) -> union_pfc (pfc p) NoChar - | Skip (p, Plus) -> pfc p - | Lazy (lazy p) -> pfc p - -and pfc_c - : type a. a compiled -> possible_first_chars - = function - | C_Return _ -> NoChar - | C_Map (_, x) -> pfc x - | C_Filter (_, x) -> pfc x - | C_App (f, x) -> pfc f <|||> lazy (pfc x) - | C_AppLeft (a, b) -> pfc a <|||> lazy (pfc b) - | C_AppRight (a, b) -> pfc a <|||> lazy (pfc b) - | C_Fail e -> IsFail e - | C_Int -> Set (Lazy.force int_first_char) - | C_Float -> Set (Lazy.force float_first_char) - | C_Junk -> AllChars - | C_AnyOf set -> ret_set set - | C_SwitchC (map, None) -> ret_set (domain_of_char_map map) - | C_SwitchC (map, Some o) -> - let s = domain_of_char_map map in - union_pfc (ret_set s) (pfc o) - | C_Eof -> NoChar - -let possible_first_chars = pfc - -(** {2 Combinators} *) - -let return x = make_c (C_Return x) -let pure = return - -let success = pure () - -let fail msg = make_c (C_Fail msg) - -let junk = make_c C_Junk - -let failf fmt = Printf.ksprintf (fun msg -> fail msg) fmt - -let map f x = match x.st with - | Compiled (C_Map (g, y)) -> make_c (C_Map (compose f g, y)) - | Compiled (C_Return x) -> pure (f x) - | _ -> make_c (C_Map (f, x)) - -let app f x = match f.st with - | Compiled (C_Return f) -> map f x - | _ -> make_c (C_App (f, x)) - -let fun_and f f' x = f x && f' x - -let filter f x = match x.st with - | Compiled (C_Return y) -> if f y then return y else fail "filter failed" - | Compiled (C_Filter (f', y)) -> make_c (C_Filter (fun_and f f', y)) - | _ -> make_c (C_Filter (f, x)) - -let app_left a b = make_c (C_AppLeft (a, b)) (* return (fun x y -> x) <*> a <*> b *) - -let app_right a b = make_c (C_AppRight (a, b)) (* return (fun x y -> y) <*> a <*> b *) - -let int = make_c C_Int - -let float = make_c C_Float - -let many ?(sep=success) p = make (Many (p, sep, Star)) - -let many1 ?(sep=success) p = make (Many (p, sep, Plus)) - -let skip p = make (Skip (p, Star)) - -let skip1 p = make (Skip (p, Plus)) - -let opt p = - map - (function - | [x] -> Some x - | [] -> None - | _ -> assert false - ) (make (Many (p, success, Question))) - -let any_of' s = make_c (C_AnyOf s) -let any_of s = any_of' (set_of_string s) - -let char c = any_of' (CharSet.singleton c) - -let spaces = skip (any_of " \t") -let spaces1 = skip1 (any_of " \t") - -let white = skip (any_of " \t\n") -let white1 = skip1 (any_of " \t\n") - -let alpha_lower_ = set_of_string "abcdefghijklmonpqrstuvwxyz" -let alpha_upper_ = set_of_string "ABCDEFGHIJKLMONPQRSTUVWXYZ" -let num_ = set_of_string "0123456789" -let alpha_ = CharSet.union alpha_lower_ alpha_upper_ -let symbols_ = set_of_string "|!;$#@%&-_/=" - -let alpha_lower = any_of' alpha_lower_ -let alpha_upper = any_of' alpha_upper_ -let num = any_of' num_ -let symbols = any_of' symbols_ -let alpha = any_of' alpha_ -let alpha_num = any_of' (CharSet.union num_ alpha_) - -let eof = make_c C_Eof - -let switch_c ?default l = - if l = [] then match default with - | None -> invalid_arg "switch_c: empty list"; - | Some d -> d - else - let map = List.fold_left - (fun map (c, t) -> - if CharMap.mem c map - then invalid_arg (str "switch_c: duplicate char %c" c); - CharMap.add c t map - ) CharMap.empty l - in - make_c (C_SwitchC (map, default)) - -exception ExnIsFail of string - -let make_switch_c a b = make_c (C_SwitchC (a, b)) - -(* binary choice: compiled into decision tree *) -let rec merge a b = - (* build a switch by first char *) - try - begin match a.st, b.st with - | Compiled (C_SwitchC (map_a, def_a)), - Compiled (C_SwitchC (map_b, def_b)) -> - (* merge jump tables *) - let def = match def_a, def_b with - | None, None -> None - | Some d, None - | None, Some d -> Some d - | Some _, Some _ -> - invalid_arg "choice: ambiguous, several parsers accept any input" - in - let map = CharMap.merge - (fun _ a b -> match a, b with - | Some a', Some b' -> Some (merge a' b') - | Some m, None - | None, Some m -> Some m - | None, None -> assert false - ) map_a map_b - in - make_switch_c map def - | Compiled (C_SwitchC (map, def)), other - | other, Compiled (C_SwitchC (map, def)) -> - let map', def' = match pfc_pc other, def with - | AllChars, _ -> - invalid_arg "choice: ambiguous, several parsers accept any input" - | NoChar, None -> map, Some (make_pc other) - | NoChar, Some _ -> - invalid_arg "choice: ambiguous" - | IsFail msg, _ -> raise (ExnIsFail msg) - | NoCharOrSet set, def - | Set set, def -> - if CharSet.exists (fun c -> CharMap.mem c map) set - then invalid_arg - (str "choice: ambiguous parsers (overlap on {%s})" - (print_char_set (CharSet.inter set (domain_of_char_map map)))); - (* else: merge jump tables *) - let map = map_add_set map set (make_pc other) in - map, def - in - make_switch_c map' def' - | _ -> - begin match possible_first_chars a, possible_first_chars b with - | (Set set1 | NoCharOrSet set1), (Set set2 | NoCharOrSet set2) -> - if CharSet.exists (fun c -> CharSet.mem c set2) set1 - then invalid_arg - (str "choice: ambiguous parsers (overlap on {%s})" - (print_char_set (CharSet.inter set1 set2))); - let map = map_add_set CharMap.empty set1 a in - let map = map_add_set map set2 b in - make_switch_c map None - | IsFail e, _ | _, IsFail e -> raise (ExnIsFail e) - | Set s, NoChar -> make_switch_c (map_add_set CharMap.empty s a) (Some b) - | NoChar, Set s -> make_switch_c (map_add_set CharMap.empty s b) (Some a) - | AllChars, _ | _, AllChars -> - invalid_arg "choice: ambiguous parsers (one accepts everything)" - | (NoChar | NoCharOrSet _), (NoChar | NoCharOrSet _) -> - invalid_arg "choice: ambiguous parsers (both accept nothing)" - end - end - with ExnIsFail msg -> make_c (C_Fail msg) - -let rec choice = function - | [] -> invalid_arg "choice: empty list"; - | [x] -> x - | a :: tl -> merge a (choice tl) - -(* temporary structure for buildings switches *) -type 'a trie = - | TrieLeaf of 'a t - | TrieNode of 'a trie CharMap.t - -let trie_empty = TrieNode CharMap.empty - -let rec parser_of_trie : type a. a trie -> a t = function - | TrieLeaf p -> p - | TrieNode m -> - make_switch_c (CharMap.map parser_of_trie' m) None -(* consume next char, then build sub-trie *) -and parser_of_trie' - : type a. a trie -> a t - = fun x -> app_right junk (parser_of_trie x) - -(* build prefix trie *) -let switch_s l = - if l = [] then invalid_arg "switch_s: empty list"; - (* add parser p in trie [t], with key slice of [s] starting at [i] *) - let rec add_trie t s i p = - if i = String.length s - then match t with - | TrieNode m when CharMap.is_empty m -> TrieLeaf p - | TrieNode _ -> invalid_arg (str "key \"%s\" is prefix of another key" s) - | TrieLeaf _ -> invalid_arg (str "duplicate key \"%s\"" s) - else - let c = String.get s i in - match t with - | TrieLeaf _ -> - invalid_arg (str "key \"%s\" is prefixed by another key" s) - | TrieNode map -> - try - let sub = CharMap.find c map in - let sub = add_trie sub s (i+1) p in - TrieNode (CharMap.add c sub map) - with Not_found -> - let sub = add_trie trie_empty s (i+1) p in - TrieNode (CharMap.add c sub map) - in - let trie = - List.fold_left - (fun trie (s, p) -> - if s = "" then invalid_arg "switch_s: empty string"; - add_trie trie s 0 p - ) trie_empty l - in - parser_of_trie trie - -let bool = - switch_s - [ "true", return true - ; "false", return false - ] - -let fix f = - (* outermost lazy needed for the recursive definition *) - let rec r = { - st=Parse (Lazy (lazy (f r))); - } in - r - -module Infix = struct - let (>|=) x f = map f x - let (<*>) = app - let (<<) = app_left - let (>>) = app_right - let (<+>) a b = choice [a; b] - let (<::>) a b = pure (fun x l -> x::l) <*> a <*> b -end - -include Infix - -let word = - pure (fun c s -> str_of_l (c :: s)) <*> alpha <*> many alpha_num - -let quoted = - let q = char '"' in - let escaped = char '\\' >> char '"' in - let inner = choice [escaped; alpha_num; any_of "()' \t\n|!;$#@%&-_/=~.,:<>[]"] in - q >> (many inner >|= str_of_l) << q - -(** {2 Compilation} *) - -let encode_cons x sep tl = pure (fun x _sep tl -> x :: tl) <*> x <*> sep <*> tl - -let encode_many - : type a. set:CharSet.t -> p:a t -> self:a list t -> sep:unit t -> a list t - = fun ~set ~p ~self ~sep -> - let on_success = encode_cons p sep self - and on_fail = pure [] in - make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail) - -let encode_opt ~set x = - let mk_one x = [x] in - let on_success = make_c (C_Map (mk_one, x)) - and on_fail = pure [] in - make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail) - -let encode_skip - : type a. set:CharSet.t -> p:a t -> self:unit t -> unit t - = fun ~set ~p ~self -> - let on_success = p >> self - and on_fail = pure () in - make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail) - -let many_ - : type a. sep:unit t -> mult:multiplicity -> p:a t -> a list t - = fun ~sep ~mult ~p -> match possible_first_chars p with - | Set set -> - begin match mult with - | Star -> fix (fun self -> encode_many ~set ~sep ~p ~self) - | Plus -> encode_cons p sep (fix (fun self -> encode_many ~set ~sep ~p ~self)) - | Question -> encode_opt ~set p - end - | IsFail msg -> fail msg - | NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)") - | AllChars -> invalid_arg (str "many: invalid parser (always succeeds)") - | NoChar -> invalid_arg (str "many: invalid parser (does not consume input)") - -let skip_ : type a. mult:multiplicity -> p:a t -> unit t - = fun ~mult ~p -> match possible_first_chars p with - | Set set -> - begin match mult with - | Star -> fix (fun self -> encode_skip ~set ~p ~self) - | Plus -> p >> fix (fun self -> encode_skip ~set ~p ~self) - | Question -> encode_opt ~set p >> pure () - end - | IsFail msg -> fail msg - | NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)") - | AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)") - | NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)") - -let rec compile - : type a. a t -> a compiled - = fun t -> match t.st with - | Compiled c -> c (* already compiled *) - | Parse (Many (p, sep, mult)) -> - let c = compile (many_ ~sep ~mult ~p) in - t.st <- Compiled c; - c - | Parse (Skip (p, mult)) -> - let c = compile (skip_ ~mult ~p) in - t.st <- Compiled c; - c - | Parse (Lazy (lazy p)) -> - let c = compile p in - t.st <- Compiled c; - c - -(** {2 Signatures} *) - -type error = { - line: int; - col: int; - msg: string; -} - -let string_of_error e = str "at %d:%d; %s" e.line e.col e.msg - -exception Error of error - -module type S = sig - type source - (** Source of characters *) - - val parse : source -> 'a t -> ('a, error) result - (** Parse the given source using the parser, and returns the parsed value. *) - - val parse': source -> 'a t -> ('a, string) result - (** Same as {!parse}, but returns a user-friendly string in case of failure *) - - val parse_exn : source -> 'a t -> 'a - (** Unsafe version of {!parse}. - @raise Error if parsing fails *) -end - -(** {2 Build a parser from a given Monadic Input} *) - -module type INPUT = sig - type t - - val read : t -> Bytes.t -> int -> int -> int -end - -type token = - | Yield of char - | EOF - -module type READER = sig - type t - type source - - val create : source -> t - val peek : t -> token (* peek; do not consume *) - val next : t -> token (* read and consume *) - val junk : t -> unit (* consume last token, obtained with junk *) - val line : t -> int - val col : t -> int -end - -module ReaderOfInput(I : INPUT) : READER with type source = I.t = struct - type t = { - mutable rline : int; - mutable rcol : int; - input : I.t; - buf : Bytes.t; - mutable i : int; - mutable len : int; - } - type source = I.t - - let line t = t.rline - let col t = t.rcol - - let create input = { - rline=1; - rcol=0; - input; - buf = Bytes.make 1024 ' '; - i=1; - len=1; (* trick for initialization *) - } - - let read_next t = - let c = Bytes.get t.buf t.i in - t.i <- t.i + 1; - if c = '\n' then ( - t.rcol <- 0; - t.rline <- t.rline + 1; - ) else ( - t.rcol <- t.rcol + 1 - ); - Yield c - - let refill t = - t.len <- I.read t.input t.buf 0 (Bytes.length t.buf); - t.i <- 0; - () - - let next t = - if t.len = 0 then EOF - else if t.i = t.len - then ( - refill t; - if t.len = 0 then EOF else read_next t - ) else read_next t - - let peek t = - if t.i = t.len - then refill t; - Yield (Bytes.get t.buf t.i) - - let junk t = - assert (t.len > 0 && t.i < t.len); - t.i <- t.i + 1 -end - -module MakeFromReader(R : READER) : S with type source = R.source = struct - type source = R.source - - let error r msg = - raise (Error { - line = R.line r; - col = R.col r; - msg; - }) - let errorf r fmt = - Printf.ksprintf - (fun msg -> error r msg) - fmt - - let is_int c = Char.code c >= Char.code '0' && Char.code c <= Char.code '9' - let to_int c = Char.code c - Char.code '0' - - let rec parse_int r ~sign i = match R.peek r with - | EOF -> i - | Yield c when is_int c -> - R.junk r; - parse_int r ~sign (10 * i + to_int c) - | Yield '-' when i = 0 && sign -> - (* switch sign: only on first char *) - R.junk r; - parse_int r ~sign:false 0 - | _ -> if sign then i else -i - - let parse_float _r _buf = assert false - - let rec parse_rec : type a. R.t -> a t -> a = - fun r p -> match compile p with - | C_Return x -> x - | C_Map (f, x) -> - let y = parse_rec r x in - f y - | C_Filter (f, x) -> - let y = parse_rec r x in - if f y then y else errorf r "filter failed" - | C_App (f, x) -> - let f' = parse_rec r f in - let x' = parse_rec r x in - f' x' - | C_AppLeft (a, b) -> - let a' = parse_rec r a in - let _ = parse_rec r b in - a' - | C_AppRight (a, b) -> - let _ = parse_rec r a in - let b' = parse_rec r b in - b' - | C_Fail msg -> error r msg - | C_Int -> parse_int r ~sign:true 0 - | C_Float -> parse_float r (Buffer.create 8) - | C_Junk -> R.junk r - | C_AnyOf set -> - begin match R.next r with - | EOF -> errorf r "expected any of %s, got EOF" (print_char_set set) - | Yield c -> - if CharSet.mem c set then c - else errorf r "expected any of %s, got '%s'" (print_char_set set) (print_char c) - end - | C_SwitchC (map, def) -> - begin match R.peek r with - | EOF -> errorf r "expected any of %s, got EOF" (print_char_map map) - | Yield c -> - begin try - let p' = CharMap.find c map in - parse_rec r p' - with Not_found -> match def with - | None -> - errorf r "expected any of %s, got %c" (print_char_map map) c - | Some d -> parse_rec r d - end - end - | C_Eof -> - begin match R.next r with - | EOF -> () - | Yield c -> errorf r "expected EOF, got %c" c - end - - (* public functions *) - let parse_exn src p = - let r = R.create src in - parse_rec r p - - let parse src p = - let r = R.create src in - try - `Ok (parse_rec r p) - with Error e -> - `Error e - - let parse' src p = match parse src p with - | `Ok x -> `Ok x - | `Error e -> `Error (string_of_error e) -end - -module Make(I : INPUT) = struct - module R = ReaderOfInput(I) - include MakeFromReader(R) -end - -module Str = MakeFromReader(struct - (* reader of string *) - type t = { - str : string; - mutable i : int; - mutable rcol : int; - mutable rline : int; - } - type source = string - - let create str = { - str; - i = 0; - rcol = 1; - rline = 1; - } - let line t = t.rline - let col t = t.rcol - let peek t = - if t.i = String.length t.str then EOF else Yield (String.get t.str t.i) - let junk t = - assert (t.i < String.length t.str); - t.i <- t.i + 1 - let next t = - if t.i = String.length t.str then EOF - else ( - let c = String.get t.str t.i in - t.i <- t.i + 1; - if c = '\n' then ( - t.rcol <- 1; - t.rline <- t.rline + 1 - ) else t.rcol <- t.rcol + 1; - Yield c - ) -end) - -module Chan = Make(struct - type t = in_channel - let read = input -end) diff --git a/src/string/CCApp_parse.mli b/src/string/CCApp_parse.mli deleted file mode 100644 index 280a2d90..00000000 --- a/src/string/CCApp_parse.mli +++ /dev/null @@ -1,272 +0,0 @@ -(* -copyright (c) 2013-2015, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -(** {1 Applicative Parser Combinators} - - Example: basic S-expr parser - -{[ - open Containers_string.App_parse;; - - type sexp = Atom of string | List of sexp list;; - - let mkatom a = Atom a;; - let mklist l = List l;; - - let ident_char = alpha_num <+> any_of "|!;$#@%&-_/=*.:~+[]<>'" ;; - let ident = many1 ident_char >|= str_of_l ;; - let atom = (ident <+> quoted) >|= mkatom ;; - - let sexp = fix (fun sexp -> - white >> - (atom <+> - ((char '(' >> many sexp << char ')') >|= mklist) - ) - );; - - Str.parse_exn "(a (b c d) e)" sexp;; - -]} - -@deprecated CCParse is more expressive and stable - -{b status: deprecated} -@since 0.10 -*) - -type ('a,'b) result = [`Error of 'b | `Ok of 'a] - -type 'a t -(** Parser that yields an error or a value of type 'a *) - -(** {6 Combinators} *) - -val return : 'a -> 'a t -(** Parser that succeeds with the given value *) - -val pure : 'a -> 'a t -(** Synonym to {!return} *) - -val junk : unit t -(** Skip next char *) - -val fail : string -> 'a t -(** [fail msg] fails with the given error message *) - -val failf : ('a, unit, string, 'b t) format4 -> 'a - -val app : ('a -> 'b) t -> 'a t -> 'b t -(** Applicative *) - -val map : ('a -> 'b) -> 'a t -> 'b t -(** Map the parsed value *) - -val int : int t -(** Parse an integer *) - -val float : float t -(** Parse a floating point number *) - -val bool : bool t -(** Parse "true" or "false" *) - -val char : char -> char t -(** [char c] parses [c] and [c] only *) - -val any_of : string -> char t -(** Parse any of the chars present in the given string *) - -val alpha_lower : char t - -val alpha_upper : char t - -val alpha : char t - -val symbols : char t -(** Symbols, such as "!-=_"... *) - -val num : char t - -val alpha_num : char t - -val word : string t -(** [word] parses any identifier not starting with an integer and - not containing any whitespace nor delimiter - TODO: specify *) - -val quoted : string t -(** Quoted string, following OCaml conventions *) - -val str_of_l : char list -> string -(** Helper to build strings from lists of chars *) - -val spaces : unit t -(** Parse a sequence of ['\t'] and [' '] *) - -val spaces1 : unit t -(** Same as {!spaces} but requires at least one space *) - -val white : unit t -(** Parse a sequence of ['\t'], ['\n'] and [' '] *) - -val white1 : unit t - -val eof : unit t -(** Matches the end of input, fails otherwise *) - -val many : ?sep:unit t -> 'a t -> 'a list t -(** 0 or more parsed elements of the given type. - @param sep separator between elements of the list (for instance, {!space}) *) - -val many1 : ?sep:unit t -> 'a t -> 'a list t -(** Same as {!many}, but needs at least one element *) - -val skip : _ t -> unit t -(** Skip 0 or more instances of the given parser *) - -val skip1 : _ t -> unit t - -val opt : 'a t -> 'a option t -(** [opt x] tries to parse [x], and returns [None] otherwise *) - -val filter : ('a -> bool) -> 'a t -> 'a t -(** [filter f p] parses the same as [p], but fails if the returned value - does not satisfy [f] *) - - -(* TODO: complement operator any_but (all but \, for instance) *) -(* TODO: a "if-then-else" combinator (assuming the test has a - set of possible first chars) *) - -val switch_c : ?default:'a t -> (char * 'a t) list -> 'a t -(** [switch_c l] matches the next char and uses the corresponding parser. - Fails if the next char is not in the list, unless default is defined. - @param default parser to use if no char matches - @raise Invalid_argument if some char occurs several times in [l] *) - -val switch_s : (string * 'a t) list -> 'a t -(** [switch_s l] attempts to match matches any of the strings in [l]. - If one of those strings matches, the corresponding parser - is used from now on. - @raise Invalid_argument if some string is a prefix of another string, - or is empty, or if the list is empty *) - -val choice : 'a t list -> 'a t -(** [choice l] chooses between the parsers, unambiguously - @raise Invalid_argument if the list is empty, or if some parsers - overlap, making the choice ambiguous *) - -val fix : ('a t -> 'a t) -> 'a t -(** [fix f] makes a fixpoint *) - -module Infix : sig - val (>|=) : 'a t -> ('a -> 'b) -> 'b t - (** Infix version of {!map} *) - - val (<*>) : ('a -> 'b) t -> 'a t -> 'b t - (** Synonym to {!app} *) - - val (>>) : _ t -> 'a t -> 'a t - (** [a >> b] parses [a], ignores its result, then parses [b] *) - - val (<<) : 'a t -> _ t -> 'a t - (** [a << b] parses [a], then [b], and discards [b] to return [a] *) - - val (<+>) : 'a t -> 'a t -> 'a t - (** [a <+> b] is [choice [a;b]], a binary choice *) - - val (<::>) : 'a t -> 'a list t -> 'a list t - (** [a <::> b] is [app (fun x l -> x::l) a b] *) -end - -include module type of Infix - -(** {2 Signatures} *) - -(** {6 Parsing} *) - -type error = { - line: int; - col: int; - msg: string; -} - -val string_of_error : error -> string - -exception Error of error - -module type S = sig - type source - (** Source of characters *) - - val parse : source -> 'a t -> ('a, error) result - (** Parse the given source using the parser, and returns the parsed value. *) - - val parse': source -> 'a t -> ('a, string) result - (** Same as {!parse}, but returns a user-friendly string in case of failure *) - - val parse_exn : source -> 'a t -> 'a - (** Unsafe version of {!parse}. - @raise Error if parsing fails *) -end - -(** {2 Parse} *) - -module type INPUT = sig - type t - - val read : t -> Bytes.t -> int -> int -> int -end - -module Make(I : INPUT) : S with type source = I.t - -(** {2 Low-level interface} *) - -val print : Format.formatter -> _ t -> unit -(** Print a parser structure, for debug purpose *) - -type token = - | Yield of char - | EOF - -module type READER = sig - type t - type source (* underlying source *) - - val create : source -> t - val peek : t -> token (* peek; do not consume *) - val next : t -> token (* read and consume *) - val junk : t -> unit (* consume last token, obtained with junk *) - val line : t -> int - val col : t -> int -end - -module MakeFromReader(R : READER) : S with type source = R.source - -(** {2 Defaults} *) - -module Str : S with type source = string - -module Chan : S with type source = in_channel diff --git a/src/string/CCLevenshtein.ml b/src/string/CCLevenshtein.ml deleted file mode 100644 index 56f822a9..00000000 --- a/src/string/CCLevenshtein.ml +++ /dev/null @@ -1,823 +0,0 @@ -(* -copyright (c) 2013, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -(** {1 Levenshtein distance} *) - -type 'a sequence = ('a -> unit) -> unit -type 'a gen = unit -> 'a option - -module type STRING = sig - type char_ - type t - - val of_list : char_ list -> t - val get : t -> int -> char_ - val length : t -> int - val compare_char : char_ -> char_ -> int -end - -(** Continuation list *) -type 'a klist = unit -> - [ - | `Nil - | `Cons of 'a * 'a klist - ] - -let rec klist_to_list l = match l () with - | `Nil -> [] - | `Cons (x,k) -> x :: klist_to_list k - -(*$inject - open CCFun - - let list_uniq_ = Q.( - let gen = Gen.(list_size (0 -- 100) (string_size ~gen:printable (1 -- 10)) - >|= CCList.sort_uniq ~cmp:String.compare - >|= List.map (fun s->s,s) - ) in - let print = Print.(list (pair string string)) in - let shrink = Shrink.(list ~shrink:(pair string string)) in - make ~small:List.length ~print ~shrink gen - ) -*) - -(*$Q - Q.(string_of_size Gen.(0 -- 30)) (fun s -> \ - let a = of_string ~limit:1 s in \ - match_with a s) -*) - -(* test that building a from s, and mutating one char of s, yields - a string s' that is accepted by a. - - --> generate triples (s, i, c) where c is a char, s a non empty string - and i a valid index in s -*) - -(*$QR - ( - let gen = Q.Gen.( - 3 -- 10 >>= fun len -> - 0 -- (len-1) >>= fun i -> - string_size (return len) >>= fun s -> - char >|= fun c -> (s,i,c) - ) in - let small (s,_,_) = String.length s in - Q.make ~small gen - ) - (fun (s,i,c) -> - let s' = Bytes.of_string s in - Bytes.set s' i c; - let a = of_string ~limit:1 s in - match_with a (Bytes.to_string s') - ) -*) - -(* test that, for an index, all retrieved strings are at a distance to - the key that is not too high *) -(*$QR & ~count:30 - ( - let mklist l = - let l' = List.map (fun s->s,s) l in - l, Index.of_list l' - in - let gen = Q.Gen.( - list_size (3 -- 15) (string_size (1 -- 10)) >|= mklist - ) in - let small (l,_) = List.length l in - let print (l,_) = Q.Print.(list string) l in - let shrink (l,_) = Sequence.map mklist (Q.Shrink.list l) in - Q.make ~small ~print ~shrink gen - ) - (fun (l,idx) -> - List.for_all - (fun s -> - let retrieved = Index.retrieve ~limit:2 idx s - |> klist_to_list in - List.for_all - (fun s' -> edit_distance s s' <= 2) retrieved && - List.for_all - (fun s' -> not (edit_distance s s' <= 2) || List.mem s' retrieved) - l - ) l - ) - -*) - -(*$R -let idx = Index.of_list ["aa", "aa"; "ab", "ab"; "cd", "cd"; "a'c", "a'c"] in - assert_equal ~printer:Q.Print.(list string) - ["a'c"; "aa"; "ab"] - (Index.retrieve ~limit:1 idx "ac" |> CCKList.to_list - |> List.sort Pervasives.compare) -*) - -module type S = sig - type char_ - type string_ - - (** {6 Edit Distance} *) - - val edit_distance : string_ -> string_ -> int - (** Edition distance between two strings. This satisfies the classical - distance axioms: it is always positive, symmetric, and satisfies - the formula [distance a b + distance b c >= distance a c] *) - - (** {6 Automaton} - An automaton, built from a string [s] and a limit [n], that accepts - every string that is at distance at most [n] from [s]. *) - - type automaton - (** Levenshtein automaton *) - - val of_string : limit:int -> string_ -> automaton - (** Build an automaton from a string, with a maximal distance [limit]. - The automaton will accept strings whose {!edit_distance} to the - parameter is at most [limit]. *) - - val of_list : limit:int -> char_ list -> automaton - (** Build an automaton from a list, with a maximal distance [limit] *) - - val debug_print : (out_channel -> char_ -> unit) -> - out_channel -> automaton -> unit - (** Output the automaton's structure on the given channel. *) - - val match_with : automaton -> string_ -> bool - (** [match_with a s] matches the string [s] against [a], and returns - [true] if the distance from [s] to the word represented by [a] is smaller - than the limit used to build [a] *) - - (** {6 Index for one-to-many matching} *) - - module Index : sig - type 'b t - (** Index that maps strings to values of type 'b. Internally it is - based on a trie. A string can only map to one value. *) - - val empty : 'b t - (** Empty index *) - - val is_empty : _ t -> bool - - val add : 'b t -> string_ -> 'b -> 'b t - (** Add a pair string/value to the index. If a value was already present - for this string it is replaced. *) - - val cardinal : _ t -> int - (** Number of bindings *) - - val remove : 'b t -> string_ -> 'b t - (** Remove a string (and its associated value, if any) from the index. *) - - val retrieve : limit:int -> 'b t -> string_ -> 'b klist - (** Lazy list of objects associated to strings close to the query string *) - - val of_list : (string_ * 'b) list -> 'b t - (** Build an index from a list of pairs of strings and values *) - - val to_list : 'b t -> (string_ * 'b) list - (** Extract a list of pairs from an index *) - - val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t - (** @since 0.14 *) - - val of_seq : (string_ * 'a) sequence -> 'a t - (** @since 0.14 *) - - val to_seq : 'a t -> (string_ * 'a) sequence - (** @since 0.14 *) - - val add_gen : 'a t -> (string_ * 'a) gen -> 'a t - (** @since 0.14 *) - - val of_gen : (string_ * 'a) gen -> 'a t - (** @since 0.14 *) - - val to_gen : 'a t -> (string_ * 'a) gen - (** @since 0.14 *) - - val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a - (** Fold over the stored pairs string/value *) - - val iter : (string_ -> 'b -> unit) -> 'b t -> unit - (** Iterate on the pairs *) - - val to_klist : 'b t -> (string_ * 'b) klist - (** Conversion to an iterator *) - end -end - -module Make(Str : STRING) -: S with type char_ = Str.char_ and type string_ = Str.t = struct - type string_ = Str.t - type char_ = Str.char_ - - let edit_distance s1 s2 = - if Str.length s1 = 0 - then Str.length s2 - else if Str.length s2 = 0 - then Str.length s1 - else if s1 = s2 - then 0 - else begin - (* distance vectors (v0=previous, v1=current) *) - let v0 = Array.make (Str.length s2 + 1) 0 in - let v1 = Array.make (Str.length s2 + 1) 0 in - (* initialize v0: v0(i) = A(0)(i) = delete i chars from t *) - for i = 0 to Str.length s2 do - v0.(i) <- i - done; - (* main loop for the bottom up dynamic algorithm *) - for i = 0 to Str.length s1 - 1 do - (* first edit distance is the deletion of i+1 elements from s *) - v1.(0) <- i+1; - - (* try add/delete/replace operations *) - for j = 0 to Str.length s2 - 1 do - let cost = if Str.compare_char (Str.get s1 i) (Str.get s2 j) = 0 then 0 else 1 in - v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost)); - done; - - (* copy v1 into v0 for next iteration *) - Array.blit v1 0 v0 0 (Str.length s2 + 1); - done; - v1.(Str.length s2) - end - - module NDA = struct - type char = - | Any - | Char of char_ - type transition = - | Success - | Upon of char * int * int - | Epsilon of int * int - - (* non deterministic automaton *) - type _t = transition list array array - - let length nda = Array.length nda - - let rec mem_tr tr l = match tr, l with - | _, [] -> false - | Success, Success::_ -> true - | Epsilon (i,j), Epsilon(i',j')::_ -> i=i' && j=j' - | Upon (Any,i,j), Upon(Any,i',j')::_ when i=i' && j=j' -> true - | Upon (Char c,i,j), Upon(Char c',i',j')::_ - when Str.compare_char c c' = 0 && i=i' && j=j' -> true - | _, _::l' -> mem_tr tr l' - - (* build NDA from the string *) - let make ~limit s = - let len = Str.length s in - let m = Array.make_matrix (len +1) (limit+1) [] in - let add_transition i j tr = - if not (mem_tr tr m.(i).(j)) - then m.(i).(j) <- tr :: m.(i).(j) - in - (* internal transitions *) - for i = 0 to len-1 do - for j = 0 to limit do - (* correct char *) - add_transition i j (Upon (Char (Str.get s i), i+1, j)); - (* other transitions *) - if j < limit then begin - (* substitution *) - add_transition i j (Upon (Any, i+1, j+1)); - (* deletion in indexed string *) - add_transition i j (Upon (Any, i, j+1)); - (* addition to indexed string *) - add_transition i j (Epsilon (i+1, j+1)); - end - done - done; - for j = 0 to limit do - (* deletions at the end *) - if j < limit - then add_transition len j (Upon (Any, len, j+1)); - (* win in any case *) - add_transition len j Success; - done; - m - - let get nda (i,j) = - nda.(i).(j) - - let is_final nda (i,j) = - List.exists - (function Success -> true | _ -> false) - (get nda (i,j)) - end - - (** deterministic automaton *) - module DFA = struct - type t = { - mutable transitions : (char_ * int) list array; - mutable is_final : bool array; - mutable otherwise : int array; (* transition by default *) - mutable len : int; - } - - let create size = { - len = 0; - transitions = Array.make size []; - is_final = Array.make size false; - otherwise = Array.make size ~-1; - } - - let _double_array ~init a = - let a' = Array.make (2 * Array.length a) init in - Array.blit a 0 a' 0 (Array.length a); - a' - - (* add a new state *) - let add_state dfa = - let n = dfa.len in - (* resize *) - if n = Array.length dfa.transitions then begin - dfa.transitions <- _double_array ~init:[] dfa.transitions; - dfa.is_final <- _double_array ~init:false dfa.is_final; - dfa.otherwise <- _double_array ~init:~-1 dfa.otherwise; - end; - dfa.len <- n + 1; - n - - let rec __mem_tr tr l = match tr, l with - | _, [] -> false - | (c,i), (c',i')::l' -> - (i=i' && compare c c' = 0) - || __mem_tr tr l' - - (* add transition *) - let add_transition dfa i tr = - if not (__mem_tr tr dfa.transitions.(i)) - then dfa.transitions.(i) <- tr :: dfa.transitions.(i) - - let add_otherwise dfa i j = - dfa.otherwise.(i) <- j - - let set_final dfa i = - dfa.is_final.(i) <- true - - (* set of pairs of ints: used for representing a set of states of the NDA *) - module NDAStateSet = Set.Make(struct - type t = int * int - let compare = Pervasives.compare - end) - - let _set_to_string s = - let b = Buffer.create 15 in - Buffer.add_char b '{'; - NDAStateSet.iter - (fun (x,y) -> Printf.bprintf b "(%d,%d)" x y) - s; - Buffer.add_char b '}'; - Buffer.contents b - - (* list of characters that can specifically be followed from the given set *) - let chars_from_set nda set = - NDAStateSet.fold - (fun state acc -> - let transitions = NDA.get nda state in - List.fold_left - (fun acc tr -> match tr with - | NDA.Upon (NDA.Char c, _, _) -> - if List.exists (fun c' -> Str.compare_char c c' = 0) acc - then acc - else c :: acc (* new char! *) - | _ -> acc - ) acc transitions - ) set [] - - (* saturate current set w.r.t epsilon links *) - let saturate_epsilon nda set = - let q = Queue.create () in - NDAStateSet.iter (fun s -> Queue.push s q) set; - let set = ref set in - while not (Queue.is_empty q) do - let state = Queue.pop q in - (*Printf.printf "saturate epsilon: add state %d,%d\n" (fst state)(snd state);*) - set := NDAStateSet.add state !set; - List.iter - (fun tr' -> match tr' with - | NDA.Epsilon (i,j) -> - if not (NDAStateSet.mem (i,j) !set) - then Queue.push (i,j) q - | _ -> () - ) (NDA.get nda state) - done; - !set - - (* find the transition that matches the given char (if any), or "*"; - may raise exceptions Not_found or LeadToSuccess. *) - let rec get_transition_for_char nda c acc transitions = - match transitions with - | NDA.Upon (NDA.Char c', i, j) :: transitions' when Str.compare_char c c' = 0 -> - (* follow same char *) - let acc = NDAStateSet.add (i, j) acc in - get_transition_for_char nda c acc transitions' - | NDA.Upon (NDA.Any, i, j) :: transitions' -> - (* follow '*' *) - let acc = NDAStateSet.add (i,j) acc in - get_transition_for_char nda c acc transitions' - | _ :: transitions' -> get_transition_for_char nda c acc transitions' - | [] -> acc - - let rec get_transitions_for_any nda acc transitions = - match transitions with - | NDA.Upon (NDA.Char _, _, _) :: transitions' -> - get_transitions_for_any nda acc transitions' - | NDA.Upon (NDA.Any, i, j) :: transitions' -> - let acc = NDAStateSet.add (i,j) acc in - get_transitions_for_any nda acc transitions' - | _:: transitions' -> get_transitions_for_any nda acc transitions' - | [] -> acc - - (* follow transition for given NDA.char, returns a new state - and a boolean indicating whether it's final *) - let follow_transition nda set c = - let set' = NDAStateSet.fold - (fun state acc -> - let transitions = NDA.get nda state in - (* among possible transitions, follow the one that matches c - the most closely *) - get_transition_for_char nda c acc transitions - ) set NDAStateSet.empty - in - saturate_epsilon nda set' - - let follow_transition_any nda set = - let set' = NDAStateSet.fold - (fun state acc -> - let transitions = NDA.get nda state in - (* among possible transitions, follow the ones that are labelled with "*" *) - get_transitions_for_any nda acc transitions - ) set NDAStateSet.empty - in - saturate_epsilon nda set' - - (* call [k] with every [transition'] that can be reached from [set], with - a bool that states whether it's final *) - let iterate_transition_set nda set k = - (*Printf.printf "iterate_transition at set %s\n" (set_to_string set);*) - (* all possible "fixed char" transitions *) - let chars = chars_from_set nda set in - List.iter - (fun c -> - (*Printf.printf "iterate_transition follows %c (at %s)\n" - (Obj.magic c) (set_to_string set);*) - let set' = follow_transition nda set c in - if not (NDAStateSet.is_empty set') - then k (NDA.Char c) set'; - ) chars; - (* remaining transitions, with only "Any" *) - (*Printf.printf "iterate transition follows * (at %s)\n" (set_to_string set);*) - let set' = follow_transition_any nda set in - if not (NDAStateSet.is_empty set') - then k NDA.Any set' - - module StateSetMap = Map.Make(NDAStateSet) - - (* get the state that corresponds to the given set of NDA states. - [states] is a map [nda set] -> [nfa state] *) - let get_state dfa states set = - try StateSetMap.find set !states - with Not_found -> - let i = add_state dfa in - states := StateSetMap.add set i !states; - i - - (* traverse the NDA. Currently we're at [set] *) - let rec traverse nda dfa states set = - let set_i = get_state dfa states set in - (* does this set lead to success? *) - let is_final = NDAStateSet.exists (NDA.is_final nda) set in - if is_final - then set_final dfa set_i; - iterate_transition_set nda set - (fun c set' -> - (*Printf.printf "traverse %s --%c--> %s\n" (set_to_string set) - (match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*') - (set_to_string set');*) - let set_i' = get_state dfa states set' in - (* link set -> set' *) - match c with - | NDA.Char c' -> - add_transition dfa set_i (c', set_i'); - traverse nda dfa states set' - | NDA.Any -> - add_otherwise dfa set_i set_i'; - traverse nda dfa states set' - ) - - let of_nda nda = - let dfa = create (NDA.length nda) in - (* map (set of NDA states) to int (state in DFA) *) - let states = ref StateSetMap.empty in - (* traverse the NDA to build the NFA *) - let set = NDAStateSet.singleton (0,0) in - let set = saturate_epsilon nda set in - traverse nda dfa states set; - (*StateSetMap.iter - (fun set i -> - Printf.printf "set %s --> state %d\n" (set_to_string set) i - ) !states;*) - dfa - - let get dfa i = - dfa.transitions.(i) - - let otherwise dfa i = - dfa.otherwise.(i) - - let is_final dfa i = - dfa.is_final.(i) - end - - let debug_print pp_char oc dfa = - Printf.fprintf oc "automaton of %d states\n" dfa.DFA.len; - for i = 0 to dfa.DFA.len-1 do - let transitions = DFA.get dfa i in - if DFA.is_final dfa i - then Printf.fprintf oc " success %d\n" i; - List.iter - (fun (c, j) -> Printf.fprintf oc " %d --%a--> %d\n" i pp_char c j ) transitions; - let o = DFA.otherwise dfa i in - if o >= 0 - then Printf.fprintf oc " %d --*--> %d\n" i o - done - - type automaton = DFA.t - - let of_string ~limit s = - let nda = NDA.make ~limit s in - let dfa = DFA.of_nda nda in - dfa - - let of_list ~limit l = - of_string ~limit (Str.of_list l) - - let rec __find_char c l = match l with - | [] -> raise Not_found - | (c', next) :: l' -> - if compare c c' = 0 - then next - else __find_char c l' - - (* transition for [c] in state [i] of [dfa]; - @raise Not_found if no transition matches *) - let __transition dfa i c = - let transitions = DFA.get dfa i in - try - __find_char c transitions - with Not_found -> - let o = DFA.otherwise dfa i in - if o >= 0 - then o - else raise Not_found - - let match_with dfa a = - let len = Str.length a in - let rec search i state = - (*Printf.printf "at state %d (dist %d)\n" i dist;*) - if i = len - then DFA.is_final dfa state - else begin - (* current char *) - let c = Str.get a i in - try - let next = __transition dfa state c in - search (i+1) next - with Not_found -> false - end - in - search 0 0 - - (** {6 Index for one-to-many matching} *) - - module Index = struct - type key = char_ - - module M = Map.Make(struct - type t = key - let compare = Str.compare_char - end) - - type 'b t = - | Node of 'b option * 'b t M.t - - let empty = Node (None, M.empty) - - let is_empty = function - | Node (None, m) -> M.is_empty m - | _ -> false - - let () = assert (is_empty empty) - - (** get/add/remove the leaf for the given array. - the continuation k takes the leaf, and returns a leaf option - that replaces the old leaf. - This function returns the new trie. *) - let goto_leaf s node k = - let len = Str.length s in - (* insert the value in given [node], assuming the current index - in [arr] is [i]. [k] is given the resulting tree. *) - let rec goto node i rebuild = match node with - | _ when i = len -> - let node' = k node in - rebuild node' - | Node (opt, m) -> - let c = Str.get s i in - let t' = - try M.find c m - with Not_found -> empty - in - goto t' (i+1) - (fun t'' -> - if is_empty t'' - then rebuild (Node (opt, M.remove c m)) - else rebuild (Node (opt, M.add c t'' m))) - in - goto node 0 (fun t -> t) - - let add trie s value = - goto_leaf s trie - (function - | Node (_, m) -> Node (Some value, m)) - - let remove trie s = - goto_leaf s trie - (function - | Node (_, m) -> Node (None, m)) - - (* traverse the automaton and the idx, yielding a klist of values *) - let retrieve ~limit idx s = - let dfa = of_string ~limit s in - (* traverse at index i in automaton, with - [fk] the failure continuation *) - let rec traverse node i ~(fk:'a klist) () = - match node with - | Node (opt, m) -> - (* all alternatives: continue exploring [m], or call [fk] *) - let fk = - M.fold - (fun c node' fk -> - try - let next = __transition dfa i c in - traverse node' next ~fk - with Not_found -> fk) - m fk - in - match opt with - | Some v when DFA.is_final dfa i -> - (* yield one solution now *) - `Cons (v, fk) - | _ -> fk () (* fail... or explore subtrees *) - in - traverse idx 0 ~fk:(fun () -> `Nil) - - let of_list l = - List.fold_left - (fun acc (arr,v) -> add acc arr v) - empty l - - let fold f acc idx = - let rec explore acc trail node = match node with - | Node (opt, m) -> - (* first, yield current value, if any *) - let acc = match opt with - | None -> acc - | Some v -> - let str = Str.of_list (List.rev trail) in - f acc str v - in - M.fold - (fun c node' acc -> explore acc (c::trail) node') - m acc - in - explore acc [] idx - - let iter f idx = - fold (fun () str v -> f str v) () idx - - let cardinal idx = fold (fun n _ _ -> n+1) 0 idx - - let to_list idx = - fold (fun acc str v -> (str,v) :: acc) [] idx - - let add_seq i s = - let i = ref i in - s (fun (arr,v) -> i := add !i arr v); - !i - - let of_seq s = add_seq empty s - - let to_seq i yield = iter (fun x y -> yield (x,y)) i - - (*$Q - list_uniq_ (fun l -> \ - Sequence.of_list l |> Index.of_seq |> Index.to_seq \ - |> Sequence.to_list |> List.sort Pervasives.compare \ - = List.sort Pervasives.compare l) - *) - - let rec add_gen i g = match g() with - | None -> i - | Some (arr,v) -> add_gen (add i arr v) g - - let of_gen g = add_gen empty g - - let to_gen s = - let st = Stack.create () in - Stack.push ([],s) st; - let rec next () = - if Stack.is_empty st then None - else - let trail, Node (opt, m) = Stack.pop st in - (* explore children *) - M.iter - (fun c node' -> Stack.push (c::trail, node') st) - m; - match opt with - | None -> next() - | Some v -> - let str = Str.of_list (List.rev trail) in - Some (str,v) - in - next - - (*$Q - list_uniq_ (fun l -> \ - Gen.of_list l |> Index.of_gen |> Index.to_gen \ - |> Gen.to_list |> List.sort Pervasives.compare \ - = List.sort Pervasives.compare l) - *) - - let to_klist idx = - let rec traverse node trail ~(fk:(string_*'a) klist) () = - let Node (opt, m) = node in - (* all alternatives: continue exploring [m], or call [fk] *) - let fk = - M.fold - (fun c node' fk -> traverse node' (c::trail) ~fk) - m fk - in - match opt with - | Some v -> - let str = Str.of_list (List.rev trail) in - `Cons ((str,v), fk) - | _ -> fk () (* fail... or explore subtrees *) - in - traverse idx [] ~fk:(fun () -> `Nil) - end -end - -include Make(struct - type t = string - type char_ = char - let compare_char = Char.compare - let length = String.length - let get = String.get - let of_list l = - let buf = Buffer.create (List.length l) in - List.iter (fun c -> Buffer.add_char buf c) l; - Buffer.contents buf -end) - -let debug_print = debug_print output_char - -(*$T - edit_distance "foo" "fo0" = 1 - edit_distance "foob" "foo" = 1 - edit_distance "yolo" "yoyo" = 1 - edit_distance "aaaaaaab" "aaaa" = 4 -*) - -(* -open Batteries;; -let words = File.with_file_in "/usr/share/dict/cracklib-small" (fun i -> IO.read_all i |> String.nsplit ~by:"\\n");; -let idx = List.fold_left (fun idx s -> Levenshtein.StrIndex.add_string idx s s) Levenshtein.StrIndex.empty words;; -Levenshtein.StrIndex.retrieve_string ~limit:1 idx "hell" |> Levenshtein.klist_to_list;; -*) diff --git a/src/string/CCLevenshtein.mli b/src/string/CCLevenshtein.mli deleted file mode 100644 index c89bd59f..00000000 --- a/src/string/CCLevenshtein.mli +++ /dev/null @@ -1,200 +0,0 @@ -(* -copyright (c) 2013, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - - -(** {1 Levenshtein distance} - -We take inspiration from -http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata -for the main algorithm and ideas. However some parts are adapted *) - -type 'a sequence = ('a -> unit) -> unit -type 'a gen = unit -> 'a option - -(** {2 Abstraction over Strings} -Due to the existence of several encodings and string representations we -abstract over the type of strings. A string is a finite array of characters -(8-bits char, unicode runes, etc.) which provides a length operation -and a function to access the n-th character. *) - -module type STRING = sig - type char_ - type t - - val of_list : char_ list -> t - val get : t -> int -> char_ - val length : t -> int - val compare_char : char_ -> char_ -> int -end - -(** {2 Continuation list} - -This data structure is used to represent a list of result that is -evaluated only as far as the user wants. If the user only wants a few elements, -she doesn't pay for the remaining ones. - -In particular, when matching a string against a (big) set of indexed -strings, we return a continuation list so that, even if there are many results, -only those actually asked for are evaluated. *) - -type 'a klist = - unit -> [ - | `Nil - | `Cons of 'a * 'a klist - ] - -val klist_to_list : 'a klist -> 'a list - (** Helper for short lists. *) - -(** {2 Signature} - -The signature for a given string representation provides 3 main things: - -- a [edit_distance] function to compute the edit distance between strings -- an [automaton] type that is built from a string [s] and a maximum distance [n], - and only accepts the strings [s'] such that [edit_distance s s' <= n]. -- an [Index] module that can be used to map many strings to values, like - a regular string map, but for which retrieval is fuzzy (for a given - maximal distance). - -A possible use of the index could be: -{[ - -let words = CCIO.with_in "/usr/share/dict/words" - (fun i -> CCIO.read_all i |> CCString.Split.list_cpy ~by:"\n");; - -let words = List.map (fun s->s,s) words;; -let idx = CCLevenshtein.Index.of_list words;; - -CCLevenshtein.Index.retrieve ~limit:1 idx "hell" |> CCLevenshtein.klist_to_list;; -]} -*) - -module type S = sig - type char_ - type string_ - - (** {6 Edit Distance} *) - - val edit_distance : string_ -> string_ -> int - (** Edition distance between two strings. This satisfies the classical - distance axioms: it is always positive, symmetric, and satisfies - the formula [distance a b + distance b c >= distance a c] *) - - (** {6 Automaton} - An automaton, built from a string [s] and a limit [n], that accepts - every string that is at distance at most [n] from [s]. *) - - type automaton - (** Levenshtein automaton *) - - val of_string : limit:int -> string_ -> automaton - (** Build an automaton from a string, with a maximal distance [limit]. - The automaton will accept strings whose {!edit_distance} to the - parameter is at most [limit]. *) - - val of_list : limit:int -> char_ list -> automaton - (** Build an automaton from a list, with a maximal distance [limit] *) - - val debug_print : (out_channel -> char_ -> unit) -> - out_channel -> automaton -> unit - (** Output the automaton's structure on the given channel. *) - - val match_with : automaton -> string_ -> bool - (** [match_with a s] matches the string [s] against [a], and returns - [true] if the distance from [s] to the word represented by [a] is smaller - than the limit used to build [a] *) - - (** {6 Index for one-to-many matching} *) - - module Index : sig - type 'b t - (** Index that maps strings to values of type 'b. Internally it is - based on a trie. A string can only map to one value. *) - - val empty : 'b t - (** Empty index *) - - val is_empty : _ t -> bool - - val add : 'b t -> string_ -> 'b -> 'b t - (** Add a pair string/value to the index. If a value was already present - for this string it is replaced. *) - - val cardinal : _ t -> int - (** Number of bindings *) - - val remove : 'b t -> string_ -> 'b t - (** Remove a string (and its associated value, if any) from the index. *) - - val retrieve : limit:int -> 'b t -> string_ -> 'b klist - (** Lazy list of objects associated to strings close to the query string *) - - val of_list : (string_ * 'b) list -> 'b t - (** Build an index from a list of pairs of strings and values *) - - val to_list : 'b t -> (string_ * 'b) list - (** Extract a list of pairs from an index *) - - val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t - (** @since 0.14 *) - - val of_seq : (string_ * 'a) sequence -> 'a t - (** @since 0.14 *) - - val to_seq : 'a t -> (string_ * 'a) sequence - (** @since 0.14 *) - - val add_gen : 'a t -> (string_ * 'a) gen -> 'a t - (** @since 0.14 *) - - val of_gen : (string_ * 'a) gen -> 'a t - (** @since 0.14 *) - - val to_gen : 'a t -> (string_ * 'a) gen - (** @since 0.14 *) - - val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a - (** Fold over the stored pairs string/value *) - - val iter : (string_ -> 'b -> unit) -> 'b t -> unit - (** Iterate on the pairs *) - - val to_klist : 'b t -> (string_ * 'b) klist - (** Conversion to an iterator *) - end -end - -(** {2 Functor} *) - -module Make(Str : STRING) : S - with type string_ = Str.t - and type char_ = Str.char_ - -(** {2 Default instance: string} *) - -include S with type char_ = char and type string_ = string - -val debug_print : out_channel -> automaton -> unit diff --git a/src/string/containers_string.ml b/src/string/containers_string.ml deleted file mode 100644 index b2e335ac..00000000 --- a/src/string/containers_string.ml +++ /dev/null @@ -1,29 +0,0 @@ -(* -copyright (c) 2013-2015, simon cruanes -all rights reserved. - -redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. redistributions in binary -form must reproduce the above copyright notice, this list of conditions and the -following disclaimer in the documentation and/or other materials provided with -the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*) - -module App_parse = CCApp_parse -module Parse = CCParse -module KMP = CCKMP -module Levenshtein = CCLevenshtein