remove containers.string

This commit is contained in:
Simon Cruanes 2016-11-03 20:27:34 +01:00
parent bd7a9ce070
commit c3e6e798e6
8 changed files with 4 additions and 2196 deletions

View file

@ -22,11 +22,6 @@ cross-module dependencies).
containers.data:: with additional data structures that don't have an containers.data:: with additional data structures that don't have an
equivalent in the standard library; equivalent in the standard library;
containers.iter:: with list-like and tree-like iterators; containers.iter:: with list-like and tree-like iterators;
containers.string:: (in directory `string`) with
a few packed modules that deal with strings (Levenshtein distance,
KMP search algorithm, and a few naive utils). Again, modules are independent
and sometimes parametric on the string and char types (so they should
be able to deal with your favorite unicode library).
- A sub-library with complicated abstractions, `containers.advanced` (with - A sub-library with complicated abstractions, `containers.advanced` (with
a LINQ-like query module, batch operations using GADTs, and others). a LINQ-like query module, batch operations using GADTs, and others).
@ -191,14 +186,6 @@ Iterators:
- `CCKList`, a persistent iterator structure (akin to a lazy list, without memoization) - `CCKList`, a persistent iterator structure (akin to a lazy list, without memoization)
- `CCKTree`, an abstract lazy tree structure - `CCKTree`, an abstract lazy tree structure
=== String
See http://cedeela.fr/~simon/software/containers/Containers_string[doc].
In the module `Containers_string`:
- `Levenshtein`: edition distance between two strings
- `KMP`: Knuth-Morris-Pratt substring algorithm
- `Parse`: simple parser combinators
=== Thread === Thread

18
_oasis
View file

@ -78,13 +78,6 @@ Library "containers_iter"
FindlibParent: containers FindlibParent: containers
FindlibName: iter FindlibName: iter
Library "containers_string"
Path: src/string
Modules: Containers_string, CCLevenshtein, CCApp_parse
BuildDepends: bytes
FindlibName: string
FindlibParent: containers
Library "containers_thread" Library "containers_thread"
Path: src/threads/ Path: src/threads/
Modules: CCPool, CCLock, CCSemaphore, CCThread, CCBlockingQueue, Modules: CCPool, CCLock, CCSemaphore, CCThread, CCBlockingQueue,
@ -102,7 +95,6 @@ Library "containers_top"
FindlibName: top FindlibName: top
FindlibParent: containers FindlibParent: containers
BuildDepends: compiler-libs.common, containers, containers.data, BuildDepends: compiler-libs.common, containers, containers.data,
containers.bigarray, containers.string,
containers.unix, containers.sexp, containers.iter containers.unix, containers.sexp, containers.iter
Document containers Document containers
@ -116,8 +108,7 @@ Document containers
"-docflags '-colorize-code -short-functors -charset utf-8'" "-docflags '-colorize-code -short-functors -charset utf-8'"
XOCamlbuildLibraries: XOCamlbuildLibraries:
containers, containers.iter, containers.data, containers, containers.iter, containers.data,
containers.string, containers.thread, containers.thread, containers.unix, containers.sexp
containers.unix, containers.sexp
Executable run_benchs Executable run_benchs
Path: benchs/ Path: benchs/
@ -126,7 +117,7 @@ Executable run_benchs
Build$: flag(bench) Build$: flag(bench)
MainIs: run_benchs.ml MainIs: run_benchs.ml
BuildDepends: containers, qcheck, BuildDepends: containers, qcheck,
containers.data, containers.string, containers.iter, containers.data, containers.iter,
containers.thread, sequence, gen, benchmark, hamt containers.thread, sequence, gen, benchmark, hamt
Executable run_bench_hash Executable run_bench_hash
@ -145,9 +136,8 @@ Executable run_qtest
CompiledObject: best CompiledObject: best
MainIs: run_qtest.ml MainIs: run_qtest.ml
Build$: flag(tests) && flag(bigarray) && flag(unix) Build$: flag(tests) && flag(bigarray) && flag(unix)
BuildDepends: containers, containers.string, containers.iter, BuildDepends: containers, containers.iter,
containers.sexp, containers.sexp, containers.unix, containers.thread,
containers.unix, containers.thread,
containers.data, containers.data,
sequence, gen, unix, oUnit, qcheck sequence, gen, unix, oUnit, qcheck

View file

@ -120,17 +120,6 @@ CCKList
CCKTree CCKTree
CCLazy_list} CCLazy_list}
{4 String}
{b findlib name}: containers.string
{!modules:
CCApp_parse
CCKMP
CCLevenshtein
CCParse
}
{4 Misc} {4 Misc}
Moved to its own repository. Moved to its own repository.

View file

@ -1,834 +0,0 @@
(*
copyright (c) 2013-2015, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Applicative Parser Combinators} *)
type ('a,'b) result = [`Error of 'b | `Ok of 'a]
type multiplicity =
| Star (* 0 or more *)
| Plus (* 1 or more *)
| Question (* 0 or 1 *)
let str fmt = Printf.sprintf fmt
module CharSet = Set.Make(Char)
module CharMap = Map.Make(Char)
let print_char = function
| '\t' -> "\\t"
| '\n' -> "\\n"
| '\r' -> "\\r"
| '"' -> "\\\""
| c -> str "%c" c
let print_char_set set =
let buf = Buffer.create 32 in
Buffer.add_char buf '"';
CharSet.iter (fun c -> Buffer.add_string buf (print_char c)) set;
Buffer.add_char buf '"';
Buffer.contents buf
let domain_of_char_map m =
CharMap.fold (fun c _ set -> CharSet.add c set) m CharSet.empty
let print_char_map map =
let l = CharMap.fold
(fun c _ acc -> print_char c :: acc) map [] in
String.concat ", " l
let ppmap ?(sep=", ") pp_k pp_v fmt m =
let first = ref true in
CharMap.iter
(fun k v ->
if !first then first := false else Format.pp_print_string fmt sep;
pp_k fmt k;
Format.pp_print_string fmt "";
pp_v fmt v;
Format.pp_print_cut fmt ()
) m;
()
let set_of_string s =
let set = ref CharSet.empty in
String.iter
(fun c ->
if CharSet.mem c !set
then invalid_arg (str "any_of: duplicate char %c" c);
set := CharSet.add c !set
) s;
!set
(* add [c -> p] to the map, for every [c] in [set] *)
let map_add_set init set p =
CharSet.fold
(fun c map -> CharMap.add c p map)
set init
(* function composition *)
let compose f g x = f (g x)
let str_of_l l =
let b = Bytes.make (List.length l) ' ' in
List.iteri (fun i c -> Bytes.set b i c) l;
Bytes.unsafe_to_string b
type 'a t = {
mutable st : 'a parse_or_compiled;
}
(* syntactic version *)
and _ parse =
| Many : 'a t * unit t * multiplicity -> 'a list parse
| Skip : 'a t * multiplicity -> unit parse (* same as Many, but ignores *)
| Lazy : 'a t lazy_t -> 'a parse
(* compiled version *)
and _ compiled =
| C_Return : 'a -> 'a compiled
| C_Map : ('a -> 'b) * 'a t -> 'b compiled
| C_Filter: ('a -> bool) * 'a t -> 'a compiled
| C_App : ('a -> 'b) t * 'a t -> 'b compiled
| C_AppLeft : 'a t * 'b t -> 'a compiled
| C_AppRight : 'a t * 'b t -> 'b compiled
| C_Fail : string -> 'a compiled
| C_Int : int compiled
| C_Float : float compiled
| C_Junk : unit compiled (* ignore next char *)
| C_AnyOf : CharSet.t -> char compiled
| C_SwitchC : 'a t CharMap.t * 'a t option -> 'a compiled
| C_Eof : unit compiled
and 'a parse_or_compiled =
| Parse of 'a parse
| Compiled of 'a compiled
(** {2 Helpers} *)
(* build a new parser *)
let make p = {st=Parse p}
let make_c c = {st=Compiled c}
let make_pc st = {st}
let ppmult fmt = function
| Star -> Format.pp_print_string fmt "*"
| Plus -> Format.pp_print_string fmt "+"
| Question -> Format.pp_print_string fmt "?"
let print fmt p =
let depth = ref 0 in
(* print up to a given limit into lazy values *)
let rec print_aux
: type a. Format.formatter -> a t -> unit
= fun fmt p ->
let ppstr = Format.pp_print_string
and ppf fmt x = Format.fprintf fmt x in
let ppc fmt c = ppf fmt "'%s'" (print_char c) in
match p.st with
| Compiled (C_Return _) -> ppstr fmt "<ret>"
| Compiled (C_Map (_, x)) -> ppf fmt "@[(map@ %a)@]" print_aux x
| Compiled (C_Filter (_, x)) -> ppf fmt "@[(filter@ %a)@]" print_aux x
| Compiled (C_App (f, x)) -> ppf fmt "@[<2>@[%a@]@ <*>@ @[%a@]@]" print_aux f print_aux x
| Compiled (C_AppLeft (a, b)) -> ppf fmt "@[%a@ <<@ %a@]" print_aux a print_aux b
| Compiled (C_AppRight (a, b)) -> ppf fmt "@[%a@ >>@ %a@]" print_aux a print_aux b
| Compiled (C_Fail _) -> ppf fmt "<fail>"
| Compiled C_Int -> ppstr fmt "<int>"
| Compiled C_Float -> ppstr fmt "<float>"
| Compiled C_Junk -> ppstr fmt "<junk>"
| Compiled (C_AnyOf set) -> ppf fmt "@[(any@ %s)@]" (print_char_set set)
| Parse (Many (p, sep, mult)) ->
ppf fmt "@[<2>(@[%a@]@ sep:@[%a@])%a@]" print_aux p print_aux sep ppmult mult
| Parse (Skip (p, mult)) ->
ppf fmt "@[<2>(skip @[%a@]%a)@]" print_aux p ppmult mult
| Compiled (C_SwitchC (map, None)) ->
ppf fmt "@[<hv2>(switch@ @[%a@])@]" (ppmap ppc print_aux) map
| Compiled (C_SwitchC (map, Some o)) ->
ppf fmt "@[<hv2>(switch@ @[%a@]@ or:%a)@]" (ppmap ppc print_aux) map print_aux o
| Parse (Lazy _) when !depth > 3 -> ppf fmt "<lazy>"
| Parse (Lazy (lazy p)) ->
incr depth;
print_aux fmt p;
decr depth
| Compiled C_Eof -> ppstr fmt "<eof>"
in
print_aux fmt p
let int_first_char = lazy (set_of_string "-0123456789")
let float_first_char = lazy (set_of_string ".-0123456789")
(* a set of characters that are valid as first characters of a parser *)
type possible_first_chars =
| Set of CharSet.t
| AllChars
| NoChar
| NoCharOrSet of CharSet.t (* either no char, or something starting with set *)
| IsFail of string
let ret_set set = match CharSet.cardinal set with
| 0 -> NoChar
| 256 -> AllChars
| _ -> Set set
let ret_no_char_or set = match CharSet.cardinal set with
| 0 -> NoChar
| 256 -> AllChars
| _ -> NoCharOrSet set
(* pfc of parsing a or b *)
let union_pfc a b = match a, b with
| Set a, Set b -> ret_set (CharSet.union a b)
| NoCharOrSet s, Set s'
| Set s', NoCharOrSet s -> ret_no_char_or (CharSet.union s s')
| NoChar, Set s
| Set s, NoChar -> ret_no_char_or s
| NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s')
| IsFail e, _ | _, IsFail e -> IsFail e
| AllChars, _ | _, AllChars -> AllChars
| NoChar, o | o, NoChar -> o
(* pfc of parsing a then b *)
let then_pfc a b = match a, b with
| Set a, Set b -> ret_set (CharSet.union a b)
| NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s')
| NoCharOrSet s, Set s' -> ret_set (CharSet.union s s')
| NoCharOrSet s, NoChar -> ret_no_char_or s
| Set s, _ -> ret_set s
| IsFail e, _ | _, IsFail e -> IsFail e
| AllChars, _ | _, AllChars -> AllChars
| NoChar, o -> o
let (<|||>) a b = match a with
| NoChar -> Lazy.force b
| NoCharOrSet _ -> then_pfc a (Lazy.force b)
| _ -> a
(* set of possibilities for the first char of a parser *)
let rec pfc : type a. a t -> possible_first_chars = fun t -> pfc_pc t.st
and pfc_pc
: type a. a parse_or_compiled -> possible_first_chars
= function
| Parse p -> pfc_p p
| Compiled c -> pfc_c c
and pfc_p
: type a. a parse -> possible_first_chars
= function
| Many (p, _, (Question | Star)) -> union_pfc (pfc p) NoChar
| Many (p, _, Plus) -> pfc p
| Skip (p, (Question | Star)) -> union_pfc (pfc p) NoChar
| Skip (p, Plus) -> pfc p
| Lazy (lazy p) -> pfc p
and pfc_c
: type a. a compiled -> possible_first_chars
= function
| C_Return _ -> NoChar
| C_Map (_, x) -> pfc x
| C_Filter (_, x) -> pfc x
| C_App (f, x) -> pfc f <|||> lazy (pfc x)
| C_AppLeft (a, b) -> pfc a <|||> lazy (pfc b)
| C_AppRight (a, b) -> pfc a <|||> lazy (pfc b)
| C_Fail e -> IsFail e
| C_Int -> Set (Lazy.force int_first_char)
| C_Float -> Set (Lazy.force float_first_char)
| C_Junk -> AllChars
| C_AnyOf set -> ret_set set
| C_SwitchC (map, None) -> ret_set (domain_of_char_map map)
| C_SwitchC (map, Some o) ->
let s = domain_of_char_map map in
union_pfc (ret_set s) (pfc o)
| C_Eof -> NoChar
let possible_first_chars = pfc
(** {2 Combinators} *)
let return x = make_c (C_Return x)
let pure = return
let success = pure ()
let fail msg = make_c (C_Fail msg)
let junk = make_c C_Junk
let failf fmt = Printf.ksprintf (fun msg -> fail msg) fmt
let map f x = match x.st with
| Compiled (C_Map (g, y)) -> make_c (C_Map (compose f g, y))
| Compiled (C_Return x) -> pure (f x)
| _ -> make_c (C_Map (f, x))
let app f x = match f.st with
| Compiled (C_Return f) -> map f x
| _ -> make_c (C_App (f, x))
let fun_and f f' x = f x && f' x
let filter f x = match x.st with
| Compiled (C_Return y) -> if f y then return y else fail "filter failed"
| Compiled (C_Filter (f', y)) -> make_c (C_Filter (fun_and f f', y))
| _ -> make_c (C_Filter (f, x))
let app_left a b = make_c (C_AppLeft (a, b)) (* return (fun x y -> x) <*> a <*> b *)
let app_right a b = make_c (C_AppRight (a, b)) (* return (fun x y -> y) <*> a <*> b *)
let int = make_c C_Int
let float = make_c C_Float
let many ?(sep=success) p = make (Many (p, sep, Star))
let many1 ?(sep=success) p = make (Many (p, sep, Plus))
let skip p = make (Skip (p, Star))
let skip1 p = make (Skip (p, Plus))
let opt p =
map
(function
| [x] -> Some x
| [] -> None
| _ -> assert false
) (make (Many (p, success, Question)))
let any_of' s = make_c (C_AnyOf s)
let any_of s = any_of' (set_of_string s)
let char c = any_of' (CharSet.singleton c)
let spaces = skip (any_of " \t")
let spaces1 = skip1 (any_of " \t")
let white = skip (any_of " \t\n")
let white1 = skip1 (any_of " \t\n")
let alpha_lower_ = set_of_string "abcdefghijklmonpqrstuvwxyz"
let alpha_upper_ = set_of_string "ABCDEFGHIJKLMONPQRSTUVWXYZ"
let num_ = set_of_string "0123456789"
let alpha_ = CharSet.union alpha_lower_ alpha_upper_
let symbols_ = set_of_string "|!;$#@%&-_/="
let alpha_lower = any_of' alpha_lower_
let alpha_upper = any_of' alpha_upper_
let num = any_of' num_
let symbols = any_of' symbols_
let alpha = any_of' alpha_
let alpha_num = any_of' (CharSet.union num_ alpha_)
let eof = make_c C_Eof
let switch_c ?default l =
if l = [] then match default with
| None -> invalid_arg "switch_c: empty list";
| Some d -> d
else
let map = List.fold_left
(fun map (c, t) ->
if CharMap.mem c map
then invalid_arg (str "switch_c: duplicate char %c" c);
CharMap.add c t map
) CharMap.empty l
in
make_c (C_SwitchC (map, default))
exception ExnIsFail of string
let make_switch_c a b = make_c (C_SwitchC (a, b))
(* binary choice: compiled into decision tree *)
let rec merge a b =
(* build a switch by first char *)
try
begin match a.st, b.st with
| Compiled (C_SwitchC (map_a, def_a)),
Compiled (C_SwitchC (map_b, def_b)) ->
(* merge jump tables *)
let def = match def_a, def_b with
| None, None -> None
| Some d, None
| None, Some d -> Some d
| Some _, Some _ ->
invalid_arg "choice: ambiguous, several parsers accept any input"
in
let map = CharMap.merge
(fun _ a b -> match a, b with
| Some a', Some b' -> Some (merge a' b')
| Some m, None
| None, Some m -> Some m
| None, None -> assert false
) map_a map_b
in
make_switch_c map def
| Compiled (C_SwitchC (map, def)), other
| other, Compiled (C_SwitchC (map, def)) ->
let map', def' = match pfc_pc other, def with
| AllChars, _ ->
invalid_arg "choice: ambiguous, several parsers accept any input"
| NoChar, None -> map, Some (make_pc other)
| NoChar, Some _ ->
invalid_arg "choice: ambiguous"
| IsFail msg, _ -> raise (ExnIsFail msg)
| NoCharOrSet set, def
| Set set, def ->
if CharSet.exists (fun c -> CharMap.mem c map) set
then invalid_arg
(str "choice: ambiguous parsers (overlap on {%s})"
(print_char_set (CharSet.inter set (domain_of_char_map map))));
(* else: merge jump tables *)
let map = map_add_set map set (make_pc other) in
map, def
in
make_switch_c map' def'
| _ ->
begin match possible_first_chars a, possible_first_chars b with
| (Set set1 | NoCharOrSet set1), (Set set2 | NoCharOrSet set2) ->
if CharSet.exists (fun c -> CharSet.mem c set2) set1
then invalid_arg
(str "choice: ambiguous parsers (overlap on {%s})"
(print_char_set (CharSet.inter set1 set2)));
let map = map_add_set CharMap.empty set1 a in
let map = map_add_set map set2 b in
make_switch_c map None
| IsFail e, _ | _, IsFail e -> raise (ExnIsFail e)
| Set s, NoChar -> make_switch_c (map_add_set CharMap.empty s a) (Some b)
| NoChar, Set s -> make_switch_c (map_add_set CharMap.empty s b) (Some a)
| AllChars, _ | _, AllChars ->
invalid_arg "choice: ambiguous parsers (one accepts everything)"
| (NoChar | NoCharOrSet _), (NoChar | NoCharOrSet _) ->
invalid_arg "choice: ambiguous parsers (both accept nothing)"
end
end
with ExnIsFail msg -> make_c (C_Fail msg)
let rec choice = function
| [] -> invalid_arg "choice: empty list";
| [x] -> x
| a :: tl -> merge a (choice tl)
(* temporary structure for buildings switches *)
type 'a trie =
| TrieLeaf of 'a t
| TrieNode of 'a trie CharMap.t
let trie_empty = TrieNode CharMap.empty
let rec parser_of_trie : type a. a trie -> a t = function
| TrieLeaf p -> p
| TrieNode m ->
make_switch_c (CharMap.map parser_of_trie' m) None
(* consume next char, then build sub-trie *)
and parser_of_trie'
: type a. a trie -> a t
= fun x -> app_right junk (parser_of_trie x)
(* build prefix trie *)
let switch_s l =
if l = [] then invalid_arg "switch_s: empty list";
(* add parser p in trie [t], with key slice of [s] starting at [i] *)
let rec add_trie t s i p =
if i = String.length s
then match t with
| TrieNode m when CharMap.is_empty m -> TrieLeaf p
| TrieNode _ -> invalid_arg (str "key \"%s\" is prefix of another key" s)
| TrieLeaf _ -> invalid_arg (str "duplicate key \"%s\"" s)
else
let c = String.get s i in
match t with
| TrieLeaf _ ->
invalid_arg (str "key \"%s\" is prefixed by another key" s)
| TrieNode map ->
try
let sub = CharMap.find c map in
let sub = add_trie sub s (i+1) p in
TrieNode (CharMap.add c sub map)
with Not_found ->
let sub = add_trie trie_empty s (i+1) p in
TrieNode (CharMap.add c sub map)
in
let trie =
List.fold_left
(fun trie (s, p) ->
if s = "" then invalid_arg "switch_s: empty string";
add_trie trie s 0 p
) trie_empty l
in
parser_of_trie trie
let bool =
switch_s
[ "true", return true
; "false", return false
]
let fix f =
(* outermost lazy needed for the recursive definition *)
let rec r = {
st=Parse (Lazy (lazy (f r)));
} in
r
module Infix = struct
let (>|=) x f = map f x
let (<*>) = app
let (<<) = app_left
let (>>) = app_right
let (<+>) a b = choice [a; b]
let (<::>) a b = pure (fun x l -> x::l) <*> a <*> b
end
include Infix
let word =
pure (fun c s -> str_of_l (c :: s)) <*> alpha <*> many alpha_num
let quoted =
let q = char '"' in
let escaped = char '\\' >> char '"' in
let inner = choice [escaped; alpha_num; any_of "()' \t\n|!;$#@%&-_/=~.,:<>[]"] in
q >> (many inner >|= str_of_l) << q
(** {2 Compilation} *)
let encode_cons x sep tl = pure (fun x _sep tl -> x :: tl) <*> x <*> sep <*> tl
let encode_many
: type a. set:CharSet.t -> p:a t -> self:a list t -> sep:unit t -> a list t
= fun ~set ~p ~self ~sep ->
let on_success = encode_cons p sep self
and on_fail = pure [] in
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
let encode_opt ~set x =
let mk_one x = [x] in
let on_success = make_c (C_Map (mk_one, x))
and on_fail = pure [] in
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
let encode_skip
: type a. set:CharSet.t -> p:a t -> self:unit t -> unit t
= fun ~set ~p ~self ->
let on_success = p >> self
and on_fail = pure () in
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
let many_
: type a. sep:unit t -> mult:multiplicity -> p:a t -> a list t
= fun ~sep ~mult ~p -> match possible_first_chars p with
| Set set ->
begin match mult with
| Star -> fix (fun self -> encode_many ~set ~sep ~p ~self)
| Plus -> encode_cons p sep (fix (fun self -> encode_many ~set ~sep ~p ~self))
| Question -> encode_opt ~set p
end
| IsFail msg -> fail msg
| NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)")
| AllChars -> invalid_arg (str "many: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "many: invalid parser (does not consume input)")
let skip_ : type a. mult:multiplicity -> p:a t -> unit t
= fun ~mult ~p -> match possible_first_chars p with
| Set set ->
begin match mult with
| Star -> fix (fun self -> encode_skip ~set ~p ~self)
| Plus -> p >> fix (fun self -> encode_skip ~set ~p ~self)
| Question -> encode_opt ~set p >> pure ()
end
| IsFail msg -> fail msg
| NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)")
| AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)")
let rec compile
: type a. a t -> a compiled
= fun t -> match t.st with
| Compiled c -> c (* already compiled *)
| Parse (Many (p, sep, mult)) ->
let c = compile (many_ ~sep ~mult ~p) in
t.st <- Compiled c;
c
| Parse (Skip (p, mult)) ->
let c = compile (skip_ ~mult ~p) in
t.st <- Compiled c;
c
| Parse (Lazy (lazy p)) ->
let c = compile p in
t.st <- Compiled c;
c
(** {2 Signatures} *)
type error = {
line: int;
col: int;
msg: string;
}
let string_of_error e = str "at %d:%d; %s" e.line e.col e.msg
exception Error of error
module type S = sig
type source
(** Source of characters *)
val parse : source -> 'a t -> ('a, error) result
(** Parse the given source using the parser, and returns the parsed value. *)
val parse': source -> 'a t -> ('a, string) result
(** Same as {!parse}, but returns a user-friendly string in case of failure *)
val parse_exn : source -> 'a t -> 'a
(** Unsafe version of {!parse}.
@raise Error if parsing fails *)
end
(** {2 Build a parser from a given Monadic Input} *)
module type INPUT = sig
type t
val read : t -> Bytes.t -> int -> int -> int
end
type token =
| Yield of char
| EOF
module type READER = sig
type t
type source
val create : source -> t
val peek : t -> token (* peek; do not consume *)
val next : t -> token (* read and consume *)
val junk : t -> unit (* consume last token, obtained with junk *)
val line : t -> int
val col : t -> int
end
module ReaderOfInput(I : INPUT) : READER with type source = I.t = struct
type t = {
mutable rline : int;
mutable rcol : int;
input : I.t;
buf : Bytes.t;
mutable i : int;
mutable len : int;
}
type source = I.t
let line t = t.rline
let col t = t.rcol
let create input = {
rline=1;
rcol=0;
input;
buf = Bytes.make 1024 ' ';
i=1;
len=1; (* trick for initialization *)
}
let read_next t =
let c = Bytes.get t.buf t.i in
t.i <- t.i + 1;
if c = '\n' then (
t.rcol <- 0;
t.rline <- t.rline + 1;
) else (
t.rcol <- t.rcol + 1
);
Yield c
let refill t =
t.len <- I.read t.input t.buf 0 (Bytes.length t.buf);
t.i <- 0;
()
let next t =
if t.len = 0 then EOF
else if t.i = t.len
then (
refill t;
if t.len = 0 then EOF else read_next t
) else read_next t
let peek t =
if t.i = t.len
then refill t;
Yield (Bytes.get t.buf t.i)
let junk t =
assert (t.len > 0 && t.i < t.len);
t.i <- t.i + 1
end
module MakeFromReader(R : READER) : S with type source = R.source = struct
type source = R.source
let error r msg =
raise (Error {
line = R.line r;
col = R.col r;
msg;
})
let errorf r fmt =
Printf.ksprintf
(fun msg -> error r msg)
fmt
let is_int c = Char.code c >= Char.code '0' && Char.code c <= Char.code '9'
let to_int c = Char.code c - Char.code '0'
let rec parse_int r ~sign i = match R.peek r with
| EOF -> i
| Yield c when is_int c ->
R.junk r;
parse_int r ~sign (10 * i + to_int c)
| Yield '-' when i = 0 && sign ->
(* switch sign: only on first char *)
R.junk r;
parse_int r ~sign:false 0
| _ -> if sign then i else -i
let parse_float _r _buf = assert false
let rec parse_rec : type a. R.t -> a t -> a =
fun r p -> match compile p with
| C_Return x -> x
| C_Map (f, x) ->
let y = parse_rec r x in
f y
| C_Filter (f, x) ->
let y = parse_rec r x in
if f y then y else errorf r "filter failed"
| C_App (f, x) ->
let f' = parse_rec r f in
let x' = parse_rec r x in
f' x'
| C_AppLeft (a, b) ->
let a' = parse_rec r a in
let _ = parse_rec r b in
a'
| C_AppRight (a, b) ->
let _ = parse_rec r a in
let b' = parse_rec r b in
b'
| C_Fail msg -> error r msg
| C_Int -> parse_int r ~sign:true 0
| C_Float -> parse_float r (Buffer.create 8)
| C_Junk -> R.junk r
| C_AnyOf set ->
begin match R.next r with
| EOF -> errorf r "expected any of %s, got EOF" (print_char_set set)
| Yield c ->
if CharSet.mem c set then c
else errorf r "expected any of %s, got '%s'" (print_char_set set) (print_char c)
end
| C_SwitchC (map, def) ->
begin match R.peek r with
| EOF -> errorf r "expected any of %s, got EOF" (print_char_map map)
| Yield c ->
begin try
let p' = CharMap.find c map in
parse_rec r p'
with Not_found -> match def with
| None ->
errorf r "expected any of %s, got %c" (print_char_map map) c
| Some d -> parse_rec r d
end
end
| C_Eof ->
begin match R.next r with
| EOF -> ()
| Yield c -> errorf r "expected EOF, got %c" c
end
(* public functions *)
let parse_exn src p =
let r = R.create src in
parse_rec r p
let parse src p =
let r = R.create src in
try
`Ok (parse_rec r p)
with Error e ->
`Error e
let parse' src p = match parse src p with
| `Ok x -> `Ok x
| `Error e -> `Error (string_of_error e)
end
module Make(I : INPUT) = struct
module R = ReaderOfInput(I)
include MakeFromReader(R)
end
module Str = MakeFromReader(struct
(* reader of string *)
type t = {
str : string;
mutable i : int;
mutable rcol : int;
mutable rline : int;
}
type source = string
let create str = {
str;
i = 0;
rcol = 1;
rline = 1;
}
let line t = t.rline
let col t = t.rcol
let peek t =
if t.i = String.length t.str then EOF else Yield (String.get t.str t.i)
let junk t =
assert (t.i < String.length t.str);
t.i <- t.i + 1
let next t =
if t.i = String.length t.str then EOF
else (
let c = String.get t.str t.i in
t.i <- t.i + 1;
if c = '\n' then (
t.rcol <- 1;
t.rline <- t.rline + 1
) else t.rcol <- t.rcol + 1;
Yield c
)
end)
module Chan = Make(struct
type t = in_channel
let read = input
end)

View file

@ -1,272 +0,0 @@
(*
copyright (c) 2013-2015, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Applicative Parser Combinators}
Example: basic S-expr parser
{[
open Containers_string.App_parse;;
type sexp = Atom of string | List of sexp list;;
let mkatom a = Atom a;;
let mklist l = List l;;
let ident_char = alpha_num <+> any_of "|!;$#@%&-_/=*.:~+[]<>'" ;;
let ident = many1 ident_char >|= str_of_l ;;
let atom = (ident <+> quoted) >|= mkatom ;;
let sexp = fix (fun sexp ->
white >>
(atom <+>
((char '(' >> many sexp << char ')') >|= mklist)
)
);;
Str.parse_exn "(a (b c d) e)" sexp;;
]}
@deprecated CCParse is more expressive and stable
{b status: deprecated}
@since 0.10
*)
type ('a,'b) result = [`Error of 'b | `Ok of 'a]
type 'a t
(** Parser that yields an error or a value of type 'a *)
(** {6 Combinators} *)
val return : 'a -> 'a t
(** Parser that succeeds with the given value *)
val pure : 'a -> 'a t
(** Synonym to {!return} *)
val junk : unit t
(** Skip next char *)
val fail : string -> 'a t
(** [fail msg] fails with the given error message *)
val failf : ('a, unit, string, 'b t) format4 -> 'a
val app : ('a -> 'b) t -> 'a t -> 'b t
(** Applicative *)
val map : ('a -> 'b) -> 'a t -> 'b t
(** Map the parsed value *)
val int : int t
(** Parse an integer *)
val float : float t
(** Parse a floating point number *)
val bool : bool t
(** Parse "true" or "false" *)
val char : char -> char t
(** [char c] parses [c] and [c] only *)
val any_of : string -> char t
(** Parse any of the chars present in the given string *)
val alpha_lower : char t
val alpha_upper : char t
val alpha : char t
val symbols : char t
(** Symbols, such as "!-=_"... *)
val num : char t
val alpha_num : char t
val word : string t
(** [word] parses any identifier not starting with an integer and
not containing any whitespace nor delimiter
TODO: specify *)
val quoted : string t
(** Quoted string, following OCaml conventions *)
val str_of_l : char list -> string
(** Helper to build strings from lists of chars *)
val spaces : unit t
(** Parse a sequence of ['\t'] and [' '] *)
val spaces1 : unit t
(** Same as {!spaces} but requires at least one space *)
val white : unit t
(** Parse a sequence of ['\t'], ['\n'] and [' '] *)
val white1 : unit t
val eof : unit t
(** Matches the end of input, fails otherwise *)
val many : ?sep:unit t -> 'a t -> 'a list t
(** 0 or more parsed elements of the given type.
@param sep separator between elements of the list (for instance, {!space}) *)
val many1 : ?sep:unit t -> 'a t -> 'a list t
(** Same as {!many}, but needs at least one element *)
val skip : _ t -> unit t
(** Skip 0 or more instances of the given parser *)
val skip1 : _ t -> unit t
val opt : 'a t -> 'a option t
(** [opt x] tries to parse [x], and returns [None] otherwise *)
val filter : ('a -> bool) -> 'a t -> 'a t
(** [filter f p] parses the same as [p], but fails if the returned value
does not satisfy [f] *)
(* TODO: complement operator any_but (all but \, for instance) *)
(* TODO: a "if-then-else" combinator (assuming the test has a
set of possible first chars) *)
val switch_c : ?default:'a t -> (char * 'a t) list -> 'a t
(** [switch_c l] matches the next char and uses the corresponding parser.
Fails if the next char is not in the list, unless default is defined.
@param default parser to use if no char matches
@raise Invalid_argument if some char occurs several times in [l] *)
val switch_s : (string * 'a t) list -> 'a t
(** [switch_s l] attempts to match matches any of the strings in [l].
If one of those strings matches, the corresponding parser
is used from now on.
@raise Invalid_argument if some string is a prefix of another string,
or is empty, or if the list is empty *)
val choice : 'a t list -> 'a t
(** [choice l] chooses between the parsers, unambiguously
@raise Invalid_argument if the list is empty, or if some parsers
overlap, making the choice ambiguous *)
val fix : ('a t -> 'a t) -> 'a t
(** [fix f] makes a fixpoint *)
module Infix : sig
val (>|=) : 'a t -> ('a -> 'b) -> 'b t
(** Infix version of {!map} *)
val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
(** Synonym to {!app} *)
val (>>) : _ t -> 'a t -> 'a t
(** [a >> b] parses [a], ignores its result, then parses [b] *)
val (<<) : 'a t -> _ t -> 'a t
(** [a << b] parses [a], then [b], and discards [b] to return [a] *)
val (<+>) : 'a t -> 'a t -> 'a t
(** [a <+> b] is [choice [a;b]], a binary choice *)
val (<::>) : 'a t -> 'a list t -> 'a list t
(** [a <::> b] is [app (fun x l -> x::l) a b] *)
end
include module type of Infix
(** {2 Signatures} *)
(** {6 Parsing} *)
type error = {
line: int;
col: int;
msg: string;
}
val string_of_error : error -> string
exception Error of error
module type S = sig
type source
(** Source of characters *)
val parse : source -> 'a t -> ('a, error) result
(** Parse the given source using the parser, and returns the parsed value. *)
val parse': source -> 'a t -> ('a, string) result
(** Same as {!parse}, but returns a user-friendly string in case of failure *)
val parse_exn : source -> 'a t -> 'a
(** Unsafe version of {!parse}.
@raise Error if parsing fails *)
end
(** {2 Parse} *)
module type INPUT = sig
type t
val read : t -> Bytes.t -> int -> int -> int
end
module Make(I : INPUT) : S with type source = I.t
(** {2 Low-level interface} *)
val print : Format.formatter -> _ t -> unit
(** Print a parser structure, for debug purpose *)
type token =
| Yield of char
| EOF
module type READER = sig
type t
type source (* underlying source *)
val create : source -> t
val peek : t -> token (* peek; do not consume *)
val next : t -> token (* read and consume *)
val junk : t -> unit (* consume last token, obtained with junk *)
val line : t -> int
val col : t -> int
end
module MakeFromReader(R : READER) : S with type source = R.source
(** {2 Defaults} *)
module Str : S with type source = string
module Chan : S with type source = in_channel

View file

@ -1,823 +0,0 @@
(*
copyright (c) 2013, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Levenshtein distance} *)
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
module type STRING = sig
type char_
type t
val of_list : char_ list -> t
val get : t -> int -> char_
val length : t -> int
val compare_char : char_ -> char_ -> int
end
(** Continuation list *)
type 'a klist = unit ->
[
| `Nil
| `Cons of 'a * 'a klist
]
let rec klist_to_list l = match l () with
| `Nil -> []
| `Cons (x,k) -> x :: klist_to_list k
(*$inject
open CCFun
let list_uniq_ = Q.(
let gen = Gen.(list_size (0 -- 100) (string_size ~gen:printable (1 -- 10))
>|= CCList.sort_uniq ~cmp:String.compare
>|= List.map (fun s->s,s)
) in
let print = Print.(list (pair string string)) in
let shrink = Shrink.(list ~shrink:(pair string string)) in
make ~small:List.length ~print ~shrink gen
)
*)
(*$Q
Q.(string_of_size Gen.(0 -- 30)) (fun s -> \
let a = of_string ~limit:1 s in \
match_with a s)
*)
(* test that building a from s, and mutating one char of s, yields
a string s' that is accepted by a.
--> generate triples (s, i, c) where c is a char, s a non empty string
and i a valid index in s
*)
(*$QR
(
let gen = Q.Gen.(
3 -- 10 >>= fun len ->
0 -- (len-1) >>= fun i ->
string_size (return len) >>= fun s ->
char >|= fun c -> (s,i,c)
) in
let small (s,_,_) = String.length s in
Q.make ~small gen
)
(fun (s,i,c) ->
let s' = Bytes.of_string s in
Bytes.set s' i c;
let a = of_string ~limit:1 s in
match_with a (Bytes.to_string s')
)
*)
(* test that, for an index, all retrieved strings are at a distance to
the key that is not too high *)
(*$QR & ~count:30
(
let mklist l =
let l' = List.map (fun s->s,s) l in
l, Index.of_list l'
in
let gen = Q.Gen.(
list_size (3 -- 15) (string_size (1 -- 10)) >|= mklist
) in
let small (l,_) = List.length l in
let print (l,_) = Q.Print.(list string) l in
let shrink (l,_) = Sequence.map mklist (Q.Shrink.list l) in
Q.make ~small ~print ~shrink gen
)
(fun (l,idx) ->
List.for_all
(fun s ->
let retrieved = Index.retrieve ~limit:2 idx s
|> klist_to_list in
List.for_all
(fun s' -> edit_distance s s' <= 2) retrieved &&
List.for_all
(fun s' -> not (edit_distance s s' <= 2) || List.mem s' retrieved)
l
) l
)
*)
(*$R
let idx = Index.of_list ["aa", "aa"; "ab", "ab"; "cd", "cd"; "a'c", "a'c"] in
assert_equal ~printer:Q.Print.(list string)
["a'c"; "aa"; "ab"]
(Index.retrieve ~limit:1 idx "ac" |> CCKList.to_list
|> List.sort Pervasives.compare)
*)
module type S = sig
type char_
type string_
(** {6 Edit Distance} *)
val edit_distance : string_ -> string_ -> int
(** Edition distance between two strings. This satisfies the classical
distance axioms: it is always positive, symmetric, and satisfies
the formula [distance a b + distance b c >= distance a c] *)
(** {6 Automaton}
An automaton, built from a string [s] and a limit [n], that accepts
every string that is at distance at most [n] from [s]. *)
type automaton
(** Levenshtein automaton *)
val of_string : limit:int -> string_ -> automaton
(** Build an automaton from a string, with a maximal distance [limit].
The automaton will accept strings whose {!edit_distance} to the
parameter is at most [limit]. *)
val of_list : limit:int -> char_ list -> automaton
(** Build an automaton from a list, with a maximal distance [limit] *)
val debug_print : (out_channel -> char_ -> unit) ->
out_channel -> automaton -> unit
(** Output the automaton's structure on the given channel. *)
val match_with : automaton -> string_ -> bool
(** [match_with a s] matches the string [s] against [a], and returns
[true] if the distance from [s] to the word represented by [a] is smaller
than the limit used to build [a] *)
(** {6 Index for one-to-many matching} *)
module Index : sig
type 'b t
(** Index that maps strings to values of type 'b. Internally it is
based on a trie. A string can only map to one value. *)
val empty : 'b t
(** Empty index *)
val is_empty : _ t -> bool
val add : 'b t -> string_ -> 'b -> 'b t
(** Add a pair string/value to the index. If a value was already present
for this string it is replaced. *)
val cardinal : _ t -> int
(** Number of bindings *)
val remove : 'b t -> string_ -> 'b t
(** Remove a string (and its associated value, if any) from the index. *)
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
(** Lazy list of objects associated to strings close to the query string *)
val of_list : (string_ * 'b) list -> 'b t
(** Build an index from a list of pairs of strings and values *)
val to_list : 'b t -> (string_ * 'b) list
(** Extract a list of pairs from an index *)
val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t
(** @since 0.14 *)
val of_seq : (string_ * 'a) sequence -> 'a t
(** @since 0.14 *)
val to_seq : 'a t -> (string_ * 'a) sequence
(** @since 0.14 *)
val add_gen : 'a t -> (string_ * 'a) gen -> 'a t
(** @since 0.14 *)
val of_gen : (string_ * 'a) gen -> 'a t
(** @since 0.14 *)
val to_gen : 'a t -> (string_ * 'a) gen
(** @since 0.14 *)
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
(** Fold over the stored pairs string/value *)
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
(** Iterate on the pairs *)
val to_klist : 'b t -> (string_ * 'b) klist
(** Conversion to an iterator *)
end
end
module Make(Str : STRING)
: S with type char_ = Str.char_ and type string_ = Str.t = struct
type string_ = Str.t
type char_ = Str.char_
let edit_distance s1 s2 =
if Str.length s1 = 0
then Str.length s2
else if Str.length s2 = 0
then Str.length s1
else if s1 = s2
then 0
else begin
(* distance vectors (v0=previous, v1=current) *)
let v0 = Array.make (Str.length s2 + 1) 0 in
let v1 = Array.make (Str.length s2 + 1) 0 in
(* initialize v0: v0(i) = A(0)(i) = delete i chars from t *)
for i = 0 to Str.length s2 do
v0.(i) <- i
done;
(* main loop for the bottom up dynamic algorithm *)
for i = 0 to Str.length s1 - 1 do
(* first edit distance is the deletion of i+1 elements from s *)
v1.(0) <- i+1;
(* try add/delete/replace operations *)
for j = 0 to Str.length s2 - 1 do
let cost = if Str.compare_char (Str.get s1 i) (Str.get s2 j) = 0 then 0 else 1 in
v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost));
done;
(* copy v1 into v0 for next iteration *)
Array.blit v1 0 v0 0 (Str.length s2 + 1);
done;
v1.(Str.length s2)
end
module NDA = struct
type char =
| Any
| Char of char_
type transition =
| Success
| Upon of char * int * int
| Epsilon of int * int
(* non deterministic automaton *)
type _t = transition list array array
let length nda = Array.length nda
let rec mem_tr tr l = match tr, l with
| _, [] -> false
| Success, Success::_ -> true
| Epsilon (i,j), Epsilon(i',j')::_ -> i=i' && j=j'
| Upon (Any,i,j), Upon(Any,i',j')::_ when i=i' && j=j' -> true
| Upon (Char c,i,j), Upon(Char c',i',j')::_
when Str.compare_char c c' = 0 && i=i' && j=j' -> true
| _, _::l' -> mem_tr tr l'
(* build NDA from the string *)
let make ~limit s =
let len = Str.length s in
let m = Array.make_matrix (len +1) (limit+1) [] in
let add_transition i j tr =
if not (mem_tr tr m.(i).(j))
then m.(i).(j) <- tr :: m.(i).(j)
in
(* internal transitions *)
for i = 0 to len-1 do
for j = 0 to limit do
(* correct char *)
add_transition i j (Upon (Char (Str.get s i), i+1, j));
(* other transitions *)
if j < limit then begin
(* substitution *)
add_transition i j (Upon (Any, i+1, j+1));
(* deletion in indexed string *)
add_transition i j (Upon (Any, i, j+1));
(* addition to indexed string *)
add_transition i j (Epsilon (i+1, j+1));
end
done
done;
for j = 0 to limit do
(* deletions at the end *)
if j < limit
then add_transition len j (Upon (Any, len, j+1));
(* win in any case *)
add_transition len j Success;
done;
m
let get nda (i,j) =
nda.(i).(j)
let is_final nda (i,j) =
List.exists
(function Success -> true | _ -> false)
(get nda (i,j))
end
(** deterministic automaton *)
module DFA = struct
type t = {
mutable transitions : (char_ * int) list array;
mutable is_final : bool array;
mutable otherwise : int array; (* transition by default *)
mutable len : int;
}
let create size = {
len = 0;
transitions = Array.make size [];
is_final = Array.make size false;
otherwise = Array.make size ~-1;
}
let _double_array ~init a =
let a' = Array.make (2 * Array.length a) init in
Array.blit a 0 a' 0 (Array.length a);
a'
(* add a new state *)
let add_state dfa =
let n = dfa.len in
(* resize *)
if n = Array.length dfa.transitions then begin
dfa.transitions <- _double_array ~init:[] dfa.transitions;
dfa.is_final <- _double_array ~init:false dfa.is_final;
dfa.otherwise <- _double_array ~init:~-1 dfa.otherwise;
end;
dfa.len <- n + 1;
n
let rec __mem_tr tr l = match tr, l with
| _, [] -> false
| (c,i), (c',i')::l' ->
(i=i' && compare c c' = 0)
|| __mem_tr tr l'
(* add transition *)
let add_transition dfa i tr =
if not (__mem_tr tr dfa.transitions.(i))
then dfa.transitions.(i) <- tr :: dfa.transitions.(i)
let add_otherwise dfa i j =
dfa.otherwise.(i) <- j
let set_final dfa i =
dfa.is_final.(i) <- true
(* set of pairs of ints: used for representing a set of states of the NDA *)
module NDAStateSet = Set.Make(struct
type t = int * int
let compare = Pervasives.compare
end)
let _set_to_string s =
let b = Buffer.create 15 in
Buffer.add_char b '{';
NDAStateSet.iter
(fun (x,y) -> Printf.bprintf b "(%d,%d)" x y)
s;
Buffer.add_char b '}';
Buffer.contents b
(* list of characters that can specifically be followed from the given set *)
let chars_from_set nda set =
NDAStateSet.fold
(fun state acc ->
let transitions = NDA.get nda state in
List.fold_left
(fun acc tr -> match tr with
| NDA.Upon (NDA.Char c, _, _) ->
if List.exists (fun c' -> Str.compare_char c c' = 0) acc
then acc
else c :: acc (* new char! *)
| _ -> acc
) acc transitions
) set []
(* saturate current set w.r.t epsilon links *)
let saturate_epsilon nda set =
let q = Queue.create () in
NDAStateSet.iter (fun s -> Queue.push s q) set;
let set = ref set in
while not (Queue.is_empty q) do
let state = Queue.pop q in
(*Printf.printf "saturate epsilon: add state %d,%d\n" (fst state)(snd state);*)
set := NDAStateSet.add state !set;
List.iter
(fun tr' -> match tr' with
| NDA.Epsilon (i,j) ->
if not (NDAStateSet.mem (i,j) !set)
then Queue.push (i,j) q
| _ -> ()
) (NDA.get nda state)
done;
!set
(* find the transition that matches the given char (if any), or "*";
may raise exceptions Not_found or LeadToSuccess. *)
let rec get_transition_for_char nda c acc transitions =
match transitions with
| NDA.Upon (NDA.Char c', i, j) :: transitions' when Str.compare_char c c' = 0 ->
(* follow same char *)
let acc = NDAStateSet.add (i, j) acc in
get_transition_for_char nda c acc transitions'
| NDA.Upon (NDA.Any, i, j) :: transitions' ->
(* follow '*' *)
let acc = NDAStateSet.add (i,j) acc in
get_transition_for_char nda c acc transitions'
| _ :: transitions' -> get_transition_for_char nda c acc transitions'
| [] -> acc
let rec get_transitions_for_any nda acc transitions =
match transitions with
| NDA.Upon (NDA.Char _, _, _) :: transitions' ->
get_transitions_for_any nda acc transitions'
| NDA.Upon (NDA.Any, i, j) :: transitions' ->
let acc = NDAStateSet.add (i,j) acc in
get_transitions_for_any nda acc transitions'
| _:: transitions' -> get_transitions_for_any nda acc transitions'
| [] -> acc
(* follow transition for given NDA.char, returns a new state
and a boolean indicating whether it's final *)
let follow_transition nda set c =
let set' = NDAStateSet.fold
(fun state acc ->
let transitions = NDA.get nda state in
(* among possible transitions, follow the one that matches c
the most closely *)
get_transition_for_char nda c acc transitions
) set NDAStateSet.empty
in
saturate_epsilon nda set'
let follow_transition_any nda set =
let set' = NDAStateSet.fold
(fun state acc ->
let transitions = NDA.get nda state in
(* among possible transitions, follow the ones that are labelled with "*" *)
get_transitions_for_any nda acc transitions
) set NDAStateSet.empty
in
saturate_epsilon nda set'
(* call [k] with every [transition'] that can be reached from [set], with
a bool that states whether it's final *)
let iterate_transition_set nda set k =
(*Printf.printf "iterate_transition at set %s\n" (set_to_string set);*)
(* all possible "fixed char" transitions *)
let chars = chars_from_set nda set in
List.iter
(fun c ->
(*Printf.printf "iterate_transition follows %c (at %s)\n"
(Obj.magic c) (set_to_string set);*)
let set' = follow_transition nda set c in
if not (NDAStateSet.is_empty set')
then k (NDA.Char c) set';
) chars;
(* remaining transitions, with only "Any" *)
(*Printf.printf "iterate transition follows * (at %s)\n" (set_to_string set);*)
let set' = follow_transition_any nda set in
if not (NDAStateSet.is_empty set')
then k NDA.Any set'
module StateSetMap = Map.Make(NDAStateSet)
(* get the state that corresponds to the given set of NDA states.
[states] is a map [nda set] -> [nfa state] *)
let get_state dfa states set =
try StateSetMap.find set !states
with Not_found ->
let i = add_state dfa in
states := StateSetMap.add set i !states;
i
(* traverse the NDA. Currently we're at [set] *)
let rec traverse nda dfa states set =
let set_i = get_state dfa states set in
(* does this set lead to success? *)
let is_final = NDAStateSet.exists (NDA.is_final nda) set in
if is_final
then set_final dfa set_i;
iterate_transition_set nda set
(fun c set' ->
(*Printf.printf "traverse %s --%c--> %s\n" (set_to_string set)
(match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*')
(set_to_string set');*)
let set_i' = get_state dfa states set' in
(* link set -> set' *)
match c with
| NDA.Char c' ->
add_transition dfa set_i (c', set_i');
traverse nda dfa states set'
| NDA.Any ->
add_otherwise dfa set_i set_i';
traverse nda dfa states set'
)
let of_nda nda =
let dfa = create (NDA.length nda) in
(* map (set of NDA states) to int (state in DFA) *)
let states = ref StateSetMap.empty in
(* traverse the NDA to build the NFA *)
let set = NDAStateSet.singleton (0,0) in
let set = saturate_epsilon nda set in
traverse nda dfa states set;
(*StateSetMap.iter
(fun set i ->
Printf.printf "set %s --> state %d\n" (set_to_string set) i
) !states;*)
dfa
let get dfa i =
dfa.transitions.(i)
let otherwise dfa i =
dfa.otherwise.(i)
let is_final dfa i =
dfa.is_final.(i)
end
let debug_print pp_char oc dfa =
Printf.fprintf oc "automaton of %d states\n" dfa.DFA.len;
for i = 0 to dfa.DFA.len-1 do
let transitions = DFA.get dfa i in
if DFA.is_final dfa i
then Printf.fprintf oc " success %d\n" i;
List.iter
(fun (c, j) -> Printf.fprintf oc " %d --%a--> %d\n" i pp_char c j ) transitions;
let o = DFA.otherwise dfa i in
if o >= 0
then Printf.fprintf oc " %d --*--> %d\n" i o
done
type automaton = DFA.t
let of_string ~limit s =
let nda = NDA.make ~limit s in
let dfa = DFA.of_nda nda in
dfa
let of_list ~limit l =
of_string ~limit (Str.of_list l)
let rec __find_char c l = match l with
| [] -> raise Not_found
| (c', next) :: l' ->
if compare c c' = 0
then next
else __find_char c l'
(* transition for [c] in state [i] of [dfa];
@raise Not_found if no transition matches *)
let __transition dfa i c =
let transitions = DFA.get dfa i in
try
__find_char c transitions
with Not_found ->
let o = DFA.otherwise dfa i in
if o >= 0
then o
else raise Not_found
let match_with dfa a =
let len = Str.length a in
let rec search i state =
(*Printf.printf "at state %d (dist %d)\n" i dist;*)
if i = len
then DFA.is_final dfa state
else begin
(* current char *)
let c = Str.get a i in
try
let next = __transition dfa state c in
search (i+1) next
with Not_found -> false
end
in
search 0 0
(** {6 Index for one-to-many matching} *)
module Index = struct
type key = char_
module M = Map.Make(struct
type t = key
let compare = Str.compare_char
end)
type 'b t =
| Node of 'b option * 'b t M.t
let empty = Node (None, M.empty)
let is_empty = function
| Node (None, m) -> M.is_empty m
| _ -> false
let () = assert (is_empty empty)
(** get/add/remove the leaf for the given array.
the continuation k takes the leaf, and returns a leaf option
that replaces the old leaf.
This function returns the new trie. *)
let goto_leaf s node k =
let len = Str.length s in
(* insert the value in given [node], assuming the current index
in [arr] is [i]. [k] is given the resulting tree. *)
let rec goto node i rebuild = match node with
| _ when i = len ->
let node' = k node in
rebuild node'
| Node (opt, m) ->
let c = Str.get s i in
let t' =
try M.find c m
with Not_found -> empty
in
goto t' (i+1)
(fun t'' ->
if is_empty t''
then rebuild (Node (opt, M.remove c m))
else rebuild (Node (opt, M.add c t'' m)))
in
goto node 0 (fun t -> t)
let add trie s value =
goto_leaf s trie
(function
| Node (_, m) -> Node (Some value, m))
let remove trie s =
goto_leaf s trie
(function
| Node (_, m) -> Node (None, m))
(* traverse the automaton and the idx, yielding a klist of values *)
let retrieve ~limit idx s =
let dfa = of_string ~limit s in
(* traverse at index i in automaton, with
[fk] the failure continuation *)
let rec traverse node i ~(fk:'a klist) () =
match node with
| Node (opt, m) ->
(* all alternatives: continue exploring [m], or call [fk] *)
let fk =
M.fold
(fun c node' fk ->
try
let next = __transition dfa i c in
traverse node' next ~fk
with Not_found -> fk)
m fk
in
match opt with
| Some v when DFA.is_final dfa i ->
(* yield one solution now *)
`Cons (v, fk)
| _ -> fk () (* fail... or explore subtrees *)
in
traverse idx 0 ~fk:(fun () -> `Nil)
let of_list l =
List.fold_left
(fun acc (arr,v) -> add acc arr v)
empty l
let fold f acc idx =
let rec explore acc trail node = match node with
| Node (opt, m) ->
(* first, yield current value, if any *)
let acc = match opt with
| None -> acc
| Some v ->
let str = Str.of_list (List.rev trail) in
f acc str v
in
M.fold
(fun c node' acc -> explore acc (c::trail) node')
m acc
in
explore acc [] idx
let iter f idx =
fold (fun () str v -> f str v) () idx
let cardinal idx = fold (fun n _ _ -> n+1) 0 idx
let to_list idx =
fold (fun acc str v -> (str,v) :: acc) [] idx
let add_seq i s =
let i = ref i in
s (fun (arr,v) -> i := add !i arr v);
!i
let of_seq s = add_seq empty s
let to_seq i yield = iter (fun x y -> yield (x,y)) i
(*$Q
list_uniq_ (fun l -> \
Sequence.of_list l |> Index.of_seq |> Index.to_seq \
|> Sequence.to_list |> List.sort Pervasives.compare \
= List.sort Pervasives.compare l)
*)
let rec add_gen i g = match g() with
| None -> i
| Some (arr,v) -> add_gen (add i arr v) g
let of_gen g = add_gen empty g
let to_gen s =
let st = Stack.create () in
Stack.push ([],s) st;
let rec next () =
if Stack.is_empty st then None
else
let trail, Node (opt, m) = Stack.pop st in
(* explore children *)
M.iter
(fun c node' -> Stack.push (c::trail, node') st)
m;
match opt with
| None -> next()
| Some v ->
let str = Str.of_list (List.rev trail) in
Some (str,v)
in
next
(*$Q
list_uniq_ (fun l -> \
Gen.of_list l |> Index.of_gen |> Index.to_gen \
|> Gen.to_list |> List.sort Pervasives.compare \
= List.sort Pervasives.compare l)
*)
let to_klist idx =
let rec traverse node trail ~(fk:(string_*'a) klist) () =
let Node (opt, m) = node in
(* all alternatives: continue exploring [m], or call [fk] *)
let fk =
M.fold
(fun c node' fk -> traverse node' (c::trail) ~fk)
m fk
in
match opt with
| Some v ->
let str = Str.of_list (List.rev trail) in
`Cons ((str,v), fk)
| _ -> fk () (* fail... or explore subtrees *)
in
traverse idx [] ~fk:(fun () -> `Nil)
end
end
include Make(struct
type t = string
type char_ = char
let compare_char = Char.compare
let length = String.length
let get = String.get
let of_list l =
let buf = Buffer.create (List.length l) in
List.iter (fun c -> Buffer.add_char buf c) l;
Buffer.contents buf
end)
let debug_print = debug_print output_char
(*$T
edit_distance "foo" "fo0" = 1
edit_distance "foob" "foo" = 1
edit_distance "yolo" "yoyo" = 1
edit_distance "aaaaaaab" "aaaa" = 4
*)
(*
open Batteries;;
let words = File.with_file_in "/usr/share/dict/cracklib-small" (fun i -> IO.read_all i |> String.nsplit ~by:"\\n");;
let idx = List.fold_left (fun idx s -> Levenshtein.StrIndex.add_string idx s s) Levenshtein.StrIndex.empty words;;
Levenshtein.StrIndex.retrieve_string ~limit:1 idx "hell" |> Levenshtein.klist_to_list;;
*)

View file

@ -1,200 +0,0 @@
(*
copyright (c) 2013, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Levenshtein distance}
We take inspiration from
http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
for the main algorithm and ideas. However some parts are adapted *)
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
(** {2 Abstraction over Strings}
Due to the existence of several encodings and string representations we
abstract over the type of strings. A string is a finite array of characters
(8-bits char, unicode runes, etc.) which provides a length operation
and a function to access the n-th character. *)
module type STRING = sig
type char_
type t
val of_list : char_ list -> t
val get : t -> int -> char_
val length : t -> int
val compare_char : char_ -> char_ -> int
end
(** {2 Continuation list}
This data structure is used to represent a list of result that is
evaluated only as far as the user wants. If the user only wants a few elements,
she doesn't pay for the remaining ones.
In particular, when matching a string against a (big) set of indexed
strings, we return a continuation list so that, even if there are many results,
only those actually asked for are evaluated. *)
type 'a klist =
unit -> [
| `Nil
| `Cons of 'a * 'a klist
]
val klist_to_list : 'a klist -> 'a list
(** Helper for short lists. *)
(** {2 Signature}
The signature for a given string representation provides 3 main things:
- a [edit_distance] function to compute the edit distance between strings
- an [automaton] type that is built from a string [s] and a maximum distance [n],
and only accepts the strings [s'] such that [edit_distance s s' <= n].
- an [Index] module that can be used to map many strings to values, like
a regular string map, but for which retrieval is fuzzy (for a given
maximal distance).
A possible use of the index could be:
{[
let words = CCIO.with_in "/usr/share/dict/words"
(fun i -> CCIO.read_all i |> CCString.Split.list_cpy ~by:"\n");;
let words = List.map (fun s->s,s) words;;
let idx = CCLevenshtein.Index.of_list words;;
CCLevenshtein.Index.retrieve ~limit:1 idx "hell" |> CCLevenshtein.klist_to_list;;
]}
*)
module type S = sig
type char_
type string_
(** {6 Edit Distance} *)
val edit_distance : string_ -> string_ -> int
(** Edition distance between two strings. This satisfies the classical
distance axioms: it is always positive, symmetric, and satisfies
the formula [distance a b + distance b c >= distance a c] *)
(** {6 Automaton}
An automaton, built from a string [s] and a limit [n], that accepts
every string that is at distance at most [n] from [s]. *)
type automaton
(** Levenshtein automaton *)
val of_string : limit:int -> string_ -> automaton
(** Build an automaton from a string, with a maximal distance [limit].
The automaton will accept strings whose {!edit_distance} to the
parameter is at most [limit]. *)
val of_list : limit:int -> char_ list -> automaton
(** Build an automaton from a list, with a maximal distance [limit] *)
val debug_print : (out_channel -> char_ -> unit) ->
out_channel -> automaton -> unit
(** Output the automaton's structure on the given channel. *)
val match_with : automaton -> string_ -> bool
(** [match_with a s] matches the string [s] against [a], and returns
[true] if the distance from [s] to the word represented by [a] is smaller
than the limit used to build [a] *)
(** {6 Index for one-to-many matching} *)
module Index : sig
type 'b t
(** Index that maps strings to values of type 'b. Internally it is
based on a trie. A string can only map to one value. *)
val empty : 'b t
(** Empty index *)
val is_empty : _ t -> bool
val add : 'b t -> string_ -> 'b -> 'b t
(** Add a pair string/value to the index. If a value was already present
for this string it is replaced. *)
val cardinal : _ t -> int
(** Number of bindings *)
val remove : 'b t -> string_ -> 'b t
(** Remove a string (and its associated value, if any) from the index. *)
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
(** Lazy list of objects associated to strings close to the query string *)
val of_list : (string_ * 'b) list -> 'b t
(** Build an index from a list of pairs of strings and values *)
val to_list : 'b t -> (string_ * 'b) list
(** Extract a list of pairs from an index *)
val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t
(** @since 0.14 *)
val of_seq : (string_ * 'a) sequence -> 'a t
(** @since 0.14 *)
val to_seq : 'a t -> (string_ * 'a) sequence
(** @since 0.14 *)
val add_gen : 'a t -> (string_ * 'a) gen -> 'a t
(** @since 0.14 *)
val of_gen : (string_ * 'a) gen -> 'a t
(** @since 0.14 *)
val to_gen : 'a t -> (string_ * 'a) gen
(** @since 0.14 *)
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
(** Fold over the stored pairs string/value *)
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
(** Iterate on the pairs *)
val to_klist : 'b t -> (string_ * 'b) klist
(** Conversion to an iterator *)
end
end
(** {2 Functor} *)
module Make(Str : STRING) : S
with type string_ = Str.t
and type char_ = Str.char_
(** {2 Default instance: string} *)
include S with type char_ = char and type string_ = string
val debug_print : out_channel -> automaton -> unit

View file

@ -1,29 +0,0 @@
(*
copyright (c) 2013-2015, simon cruanes
all rights reserved.
redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
module App_parse = CCApp_parse
module Parse = CCParse
module KMP = CCKMP
module Levenshtein = CCLevenshtein