mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 11:15:31 -05:00
remove containers.string
This commit is contained in:
parent
bd7a9ce070
commit
c3e6e798e6
8 changed files with 4 additions and 2196 deletions
13
README.adoc
13
README.adoc
|
|
@ -22,11 +22,6 @@ cross-module dependencies).
|
||||||
containers.data:: with additional data structures that don't have an
|
containers.data:: with additional data structures that don't have an
|
||||||
equivalent in the standard library;
|
equivalent in the standard library;
|
||||||
containers.iter:: with list-like and tree-like iterators;
|
containers.iter:: with list-like and tree-like iterators;
|
||||||
containers.string:: (in directory `string`) with
|
|
||||||
a few packed modules that deal with strings (Levenshtein distance,
|
|
||||||
KMP search algorithm, and a few naive utils). Again, modules are independent
|
|
||||||
and sometimes parametric on the string and char types (so they should
|
|
||||||
be able to deal with your favorite unicode library).
|
|
||||||
|
|
||||||
- A sub-library with complicated abstractions, `containers.advanced` (with
|
- A sub-library with complicated abstractions, `containers.advanced` (with
|
||||||
a LINQ-like query module, batch operations using GADTs, and others).
|
a LINQ-like query module, batch operations using GADTs, and others).
|
||||||
|
|
@ -191,14 +186,6 @@ Iterators:
|
||||||
- `CCKList`, a persistent iterator structure (akin to a lazy list, without memoization)
|
- `CCKList`, a persistent iterator structure (akin to a lazy list, without memoization)
|
||||||
- `CCKTree`, an abstract lazy tree structure
|
- `CCKTree`, an abstract lazy tree structure
|
||||||
|
|
||||||
=== String
|
|
||||||
|
|
||||||
See http://cedeela.fr/~simon/software/containers/Containers_string[doc].
|
|
||||||
|
|
||||||
In the module `Containers_string`:
|
|
||||||
- `Levenshtein`: edition distance between two strings
|
|
||||||
- `KMP`: Knuth-Morris-Pratt substring algorithm
|
|
||||||
- `Parse`: simple parser combinators
|
|
||||||
|
|
||||||
=== Thread
|
=== Thread
|
||||||
|
|
||||||
|
|
|
||||||
18
_oasis
18
_oasis
|
|
@ -78,13 +78,6 @@ Library "containers_iter"
|
||||||
FindlibParent: containers
|
FindlibParent: containers
|
||||||
FindlibName: iter
|
FindlibName: iter
|
||||||
|
|
||||||
Library "containers_string"
|
|
||||||
Path: src/string
|
|
||||||
Modules: Containers_string, CCLevenshtein, CCApp_parse
|
|
||||||
BuildDepends: bytes
|
|
||||||
FindlibName: string
|
|
||||||
FindlibParent: containers
|
|
||||||
|
|
||||||
Library "containers_thread"
|
Library "containers_thread"
|
||||||
Path: src/threads/
|
Path: src/threads/
|
||||||
Modules: CCPool, CCLock, CCSemaphore, CCThread, CCBlockingQueue,
|
Modules: CCPool, CCLock, CCSemaphore, CCThread, CCBlockingQueue,
|
||||||
|
|
@ -102,7 +95,6 @@ Library "containers_top"
|
||||||
FindlibName: top
|
FindlibName: top
|
||||||
FindlibParent: containers
|
FindlibParent: containers
|
||||||
BuildDepends: compiler-libs.common, containers, containers.data,
|
BuildDepends: compiler-libs.common, containers, containers.data,
|
||||||
containers.bigarray, containers.string,
|
|
||||||
containers.unix, containers.sexp, containers.iter
|
containers.unix, containers.sexp, containers.iter
|
||||||
|
|
||||||
Document containers
|
Document containers
|
||||||
|
|
@ -116,8 +108,7 @@ Document containers
|
||||||
"-docflags '-colorize-code -short-functors -charset utf-8'"
|
"-docflags '-colorize-code -short-functors -charset utf-8'"
|
||||||
XOCamlbuildLibraries:
|
XOCamlbuildLibraries:
|
||||||
containers, containers.iter, containers.data,
|
containers, containers.iter, containers.data,
|
||||||
containers.string, containers.thread,
|
containers.thread, containers.unix, containers.sexp
|
||||||
containers.unix, containers.sexp
|
|
||||||
|
|
||||||
Executable run_benchs
|
Executable run_benchs
|
||||||
Path: benchs/
|
Path: benchs/
|
||||||
|
|
@ -126,7 +117,7 @@ Executable run_benchs
|
||||||
Build$: flag(bench)
|
Build$: flag(bench)
|
||||||
MainIs: run_benchs.ml
|
MainIs: run_benchs.ml
|
||||||
BuildDepends: containers, qcheck,
|
BuildDepends: containers, qcheck,
|
||||||
containers.data, containers.string, containers.iter,
|
containers.data, containers.iter,
|
||||||
containers.thread, sequence, gen, benchmark, hamt
|
containers.thread, sequence, gen, benchmark, hamt
|
||||||
|
|
||||||
Executable run_bench_hash
|
Executable run_bench_hash
|
||||||
|
|
@ -145,9 +136,8 @@ Executable run_qtest
|
||||||
CompiledObject: best
|
CompiledObject: best
|
||||||
MainIs: run_qtest.ml
|
MainIs: run_qtest.ml
|
||||||
Build$: flag(tests) && flag(bigarray) && flag(unix)
|
Build$: flag(tests) && flag(bigarray) && flag(unix)
|
||||||
BuildDepends: containers, containers.string, containers.iter,
|
BuildDepends: containers, containers.iter,
|
||||||
containers.sexp,
|
containers.sexp, containers.unix, containers.thread,
|
||||||
containers.unix, containers.thread,
|
|
||||||
containers.data,
|
containers.data,
|
||||||
sequence, gen, unix, oUnit, qcheck
|
sequence, gen, unix, oUnit, qcheck
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -120,17 +120,6 @@ CCKList
|
||||||
CCKTree
|
CCKTree
|
||||||
CCLazy_list}
|
CCLazy_list}
|
||||||
|
|
||||||
{4 String}
|
|
||||||
|
|
||||||
{b findlib name}: containers.string
|
|
||||||
|
|
||||||
{!modules:
|
|
||||||
CCApp_parse
|
|
||||||
CCKMP
|
|
||||||
CCLevenshtein
|
|
||||||
CCParse
|
|
||||||
}
|
|
||||||
|
|
||||||
{4 Misc}
|
{4 Misc}
|
||||||
|
|
||||||
Moved to its own repository.
|
Moved to its own repository.
|
||||||
|
|
|
||||||
|
|
@ -1,834 +0,0 @@
|
||||||
(*
|
|
||||||
copyright (c) 2013-2015, simon cruanes
|
|
||||||
all rights reserved.
|
|
||||||
|
|
||||||
redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are met:
|
|
||||||
|
|
||||||
redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer. redistributions in binary
|
|
||||||
form must reproduce the above copyright notice, this list of conditions and the
|
|
||||||
following disclaimer in the documentation and/or other materials provided with
|
|
||||||
the distribution.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
||||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*)
|
|
||||||
|
|
||||||
(** {1 Applicative Parser Combinators} *)
|
|
||||||
|
|
||||||
type ('a,'b) result = [`Error of 'b | `Ok of 'a]
|
|
||||||
|
|
||||||
type multiplicity =
|
|
||||||
| Star (* 0 or more *)
|
|
||||||
| Plus (* 1 or more *)
|
|
||||||
| Question (* 0 or 1 *)
|
|
||||||
|
|
||||||
let str fmt = Printf.sprintf fmt
|
|
||||||
|
|
||||||
module CharSet = Set.Make(Char)
|
|
||||||
module CharMap = Map.Make(Char)
|
|
||||||
|
|
||||||
let print_char = function
|
|
||||||
| '\t' -> "\\t"
|
|
||||||
| '\n' -> "\\n"
|
|
||||||
| '\r' -> "\\r"
|
|
||||||
| '"' -> "\\\""
|
|
||||||
| c -> str "%c" c
|
|
||||||
|
|
||||||
let print_char_set set =
|
|
||||||
let buf = Buffer.create 32 in
|
|
||||||
Buffer.add_char buf '"';
|
|
||||||
CharSet.iter (fun c -> Buffer.add_string buf (print_char c)) set;
|
|
||||||
Buffer.add_char buf '"';
|
|
||||||
Buffer.contents buf
|
|
||||||
|
|
||||||
let domain_of_char_map m =
|
|
||||||
CharMap.fold (fun c _ set -> CharSet.add c set) m CharSet.empty
|
|
||||||
|
|
||||||
let print_char_map map =
|
|
||||||
let l = CharMap.fold
|
|
||||||
(fun c _ acc -> print_char c :: acc) map [] in
|
|
||||||
String.concat ", " l
|
|
||||||
|
|
||||||
let ppmap ?(sep=", ") pp_k pp_v fmt m =
|
|
||||||
let first = ref true in
|
|
||||||
CharMap.iter
|
|
||||||
(fun k v ->
|
|
||||||
if !first then first := false else Format.pp_print_string fmt sep;
|
|
||||||
pp_k fmt k;
|
|
||||||
Format.pp_print_string fmt " → ";
|
|
||||||
pp_v fmt v;
|
|
||||||
Format.pp_print_cut fmt ()
|
|
||||||
) m;
|
|
||||||
()
|
|
||||||
|
|
||||||
let set_of_string s =
|
|
||||||
let set = ref CharSet.empty in
|
|
||||||
String.iter
|
|
||||||
(fun c ->
|
|
||||||
if CharSet.mem c !set
|
|
||||||
then invalid_arg (str "any_of: duplicate char %c" c);
|
|
||||||
set := CharSet.add c !set
|
|
||||||
) s;
|
|
||||||
!set
|
|
||||||
|
|
||||||
(* add [c -> p] to the map, for every [c] in [set] *)
|
|
||||||
let map_add_set init set p =
|
|
||||||
CharSet.fold
|
|
||||||
(fun c map -> CharMap.add c p map)
|
|
||||||
set init
|
|
||||||
|
|
||||||
(* function composition *)
|
|
||||||
let compose f g x = f (g x)
|
|
||||||
|
|
||||||
let str_of_l l =
|
|
||||||
let b = Bytes.make (List.length l) ' ' in
|
|
||||||
List.iteri (fun i c -> Bytes.set b i c) l;
|
|
||||||
Bytes.unsafe_to_string b
|
|
||||||
|
|
||||||
type 'a t = {
|
|
||||||
mutable st : 'a parse_or_compiled;
|
|
||||||
}
|
|
||||||
|
|
||||||
(* syntactic version *)
|
|
||||||
and _ parse =
|
|
||||||
| Many : 'a t * unit t * multiplicity -> 'a list parse
|
|
||||||
| Skip : 'a t * multiplicity -> unit parse (* same as Many, but ignores *)
|
|
||||||
| Lazy : 'a t lazy_t -> 'a parse
|
|
||||||
|
|
||||||
(* compiled version *)
|
|
||||||
and _ compiled =
|
|
||||||
| C_Return : 'a -> 'a compiled
|
|
||||||
| C_Map : ('a -> 'b) * 'a t -> 'b compiled
|
|
||||||
| C_Filter: ('a -> bool) * 'a t -> 'a compiled
|
|
||||||
| C_App : ('a -> 'b) t * 'a t -> 'b compiled
|
|
||||||
| C_AppLeft : 'a t * 'b t -> 'a compiled
|
|
||||||
| C_AppRight : 'a t * 'b t -> 'b compiled
|
|
||||||
| C_Fail : string -> 'a compiled
|
|
||||||
| C_Int : int compiled
|
|
||||||
| C_Float : float compiled
|
|
||||||
| C_Junk : unit compiled (* ignore next char *)
|
|
||||||
| C_AnyOf : CharSet.t -> char compiled
|
|
||||||
| C_SwitchC : 'a t CharMap.t * 'a t option -> 'a compiled
|
|
||||||
| C_Eof : unit compiled
|
|
||||||
|
|
||||||
and 'a parse_or_compiled =
|
|
||||||
| Parse of 'a parse
|
|
||||||
| Compiled of 'a compiled
|
|
||||||
|
|
||||||
(** {2 Helpers} *)
|
|
||||||
|
|
||||||
(* build a new parser *)
|
|
||||||
let make p = {st=Parse p}
|
|
||||||
let make_c c = {st=Compiled c}
|
|
||||||
let make_pc st = {st}
|
|
||||||
|
|
||||||
let ppmult fmt = function
|
|
||||||
| Star -> Format.pp_print_string fmt "*"
|
|
||||||
| Plus -> Format.pp_print_string fmt "+"
|
|
||||||
| Question -> Format.pp_print_string fmt "?"
|
|
||||||
|
|
||||||
let print fmt p =
|
|
||||||
let depth = ref 0 in
|
|
||||||
(* print up to a given limit into lazy values *)
|
|
||||||
let rec print_aux
|
|
||||||
: type a. Format.formatter -> a t -> unit
|
|
||||||
= fun fmt p ->
|
|
||||||
let ppstr = Format.pp_print_string
|
|
||||||
and ppf fmt x = Format.fprintf fmt x in
|
|
||||||
let ppc fmt c = ppf fmt "'%s'" (print_char c) in
|
|
||||||
match p.st with
|
|
||||||
| Compiled (C_Return _) -> ppstr fmt "<ret>"
|
|
||||||
| Compiled (C_Map (_, x)) -> ppf fmt "@[(map@ %a)@]" print_aux x
|
|
||||||
| Compiled (C_Filter (_, x)) -> ppf fmt "@[(filter@ %a)@]" print_aux x
|
|
||||||
| Compiled (C_App (f, x)) -> ppf fmt "@[<2>@[%a@]@ <*>@ @[%a@]@]" print_aux f print_aux x
|
|
||||||
| Compiled (C_AppLeft (a, b)) -> ppf fmt "@[%a@ <<@ %a@]" print_aux a print_aux b
|
|
||||||
| Compiled (C_AppRight (a, b)) -> ppf fmt "@[%a@ >>@ %a@]" print_aux a print_aux b
|
|
||||||
| Compiled (C_Fail _) -> ppf fmt "<fail>"
|
|
||||||
| Compiled C_Int -> ppstr fmt "<int>"
|
|
||||||
| Compiled C_Float -> ppstr fmt "<float>"
|
|
||||||
| Compiled C_Junk -> ppstr fmt "<junk>"
|
|
||||||
| Compiled (C_AnyOf set) -> ppf fmt "@[(any@ %s)@]" (print_char_set set)
|
|
||||||
| Parse (Many (p, sep, mult)) ->
|
|
||||||
ppf fmt "@[<2>(@[%a@]@ sep:@[%a@])%a@]" print_aux p print_aux sep ppmult mult
|
|
||||||
| Parse (Skip (p, mult)) ->
|
|
||||||
ppf fmt "@[<2>(skip @[%a@]%a)@]" print_aux p ppmult mult
|
|
||||||
| Compiled (C_SwitchC (map, None)) ->
|
|
||||||
ppf fmt "@[<hv2>(switch@ @[%a@])@]" (ppmap ppc print_aux) map
|
|
||||||
| Compiled (C_SwitchC (map, Some o)) ->
|
|
||||||
ppf fmt "@[<hv2>(switch@ @[%a@]@ or:%a)@]" (ppmap ppc print_aux) map print_aux o
|
|
||||||
| Parse (Lazy _) when !depth > 3 -> ppf fmt "<lazy>"
|
|
||||||
| Parse (Lazy (lazy p)) ->
|
|
||||||
incr depth;
|
|
||||||
print_aux fmt p;
|
|
||||||
decr depth
|
|
||||||
| Compiled C_Eof -> ppstr fmt "<eof>"
|
|
||||||
in
|
|
||||||
print_aux fmt p
|
|
||||||
|
|
||||||
let int_first_char = lazy (set_of_string "-0123456789")
|
|
||||||
let float_first_char = lazy (set_of_string ".-0123456789")
|
|
||||||
|
|
||||||
(* a set of characters that are valid as first characters of a parser *)
|
|
||||||
type possible_first_chars =
|
|
||||||
| Set of CharSet.t
|
|
||||||
| AllChars
|
|
||||||
| NoChar
|
|
||||||
| NoCharOrSet of CharSet.t (* either no char, or something starting with set *)
|
|
||||||
| IsFail of string
|
|
||||||
|
|
||||||
let ret_set set = match CharSet.cardinal set with
|
|
||||||
| 0 -> NoChar
|
|
||||||
| 256 -> AllChars
|
|
||||||
| _ -> Set set
|
|
||||||
|
|
||||||
let ret_no_char_or set = match CharSet.cardinal set with
|
|
||||||
| 0 -> NoChar
|
|
||||||
| 256 -> AllChars
|
|
||||||
| _ -> NoCharOrSet set
|
|
||||||
|
|
||||||
(* pfc of parsing a or b *)
|
|
||||||
let union_pfc a b = match a, b with
|
|
||||||
| Set a, Set b -> ret_set (CharSet.union a b)
|
|
||||||
| NoCharOrSet s, Set s'
|
|
||||||
| Set s', NoCharOrSet s -> ret_no_char_or (CharSet.union s s')
|
|
||||||
| NoChar, Set s
|
|
||||||
| Set s, NoChar -> ret_no_char_or s
|
|
||||||
| NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s')
|
|
||||||
| IsFail e, _ | _, IsFail e -> IsFail e
|
|
||||||
| AllChars, _ | _, AllChars -> AllChars
|
|
||||||
| NoChar, o | o, NoChar -> o
|
|
||||||
|
|
||||||
(* pfc of parsing a then b *)
|
|
||||||
let then_pfc a b = match a, b with
|
|
||||||
| Set a, Set b -> ret_set (CharSet.union a b)
|
|
||||||
| NoCharOrSet s, NoCharOrSet s' -> ret_no_char_or (CharSet.union s s')
|
|
||||||
| NoCharOrSet s, Set s' -> ret_set (CharSet.union s s')
|
|
||||||
| NoCharOrSet s, NoChar -> ret_no_char_or s
|
|
||||||
| Set s, _ -> ret_set s
|
|
||||||
| IsFail e, _ | _, IsFail e -> IsFail e
|
|
||||||
| AllChars, _ | _, AllChars -> AllChars
|
|
||||||
| NoChar, o -> o
|
|
||||||
|
|
||||||
let (<|||>) a b = match a with
|
|
||||||
| NoChar -> Lazy.force b
|
|
||||||
| NoCharOrSet _ -> then_pfc a (Lazy.force b)
|
|
||||||
| _ -> a
|
|
||||||
|
|
||||||
(* set of possibilities for the first char of a parser *)
|
|
||||||
let rec pfc : type a. a t -> possible_first_chars = fun t -> pfc_pc t.st
|
|
||||||
|
|
||||||
and pfc_pc
|
|
||||||
: type a. a parse_or_compiled -> possible_first_chars
|
|
||||||
= function
|
|
||||||
| Parse p -> pfc_p p
|
|
||||||
| Compiled c -> pfc_c c
|
|
||||||
|
|
||||||
and pfc_p
|
|
||||||
: type a. a parse -> possible_first_chars
|
|
||||||
= function
|
|
||||||
| Many (p, _, (Question | Star)) -> union_pfc (pfc p) NoChar
|
|
||||||
| Many (p, _, Plus) -> pfc p
|
|
||||||
| Skip (p, (Question | Star)) -> union_pfc (pfc p) NoChar
|
|
||||||
| Skip (p, Plus) -> pfc p
|
|
||||||
| Lazy (lazy p) -> pfc p
|
|
||||||
|
|
||||||
and pfc_c
|
|
||||||
: type a. a compiled -> possible_first_chars
|
|
||||||
= function
|
|
||||||
| C_Return _ -> NoChar
|
|
||||||
| C_Map (_, x) -> pfc x
|
|
||||||
| C_Filter (_, x) -> pfc x
|
|
||||||
| C_App (f, x) -> pfc f <|||> lazy (pfc x)
|
|
||||||
| C_AppLeft (a, b) -> pfc a <|||> lazy (pfc b)
|
|
||||||
| C_AppRight (a, b) -> pfc a <|||> lazy (pfc b)
|
|
||||||
| C_Fail e -> IsFail e
|
|
||||||
| C_Int -> Set (Lazy.force int_first_char)
|
|
||||||
| C_Float -> Set (Lazy.force float_first_char)
|
|
||||||
| C_Junk -> AllChars
|
|
||||||
| C_AnyOf set -> ret_set set
|
|
||||||
| C_SwitchC (map, None) -> ret_set (domain_of_char_map map)
|
|
||||||
| C_SwitchC (map, Some o) ->
|
|
||||||
let s = domain_of_char_map map in
|
|
||||||
union_pfc (ret_set s) (pfc o)
|
|
||||||
| C_Eof -> NoChar
|
|
||||||
|
|
||||||
let possible_first_chars = pfc
|
|
||||||
|
|
||||||
(** {2 Combinators} *)
|
|
||||||
|
|
||||||
let return x = make_c (C_Return x)
|
|
||||||
let pure = return
|
|
||||||
|
|
||||||
let success = pure ()
|
|
||||||
|
|
||||||
let fail msg = make_c (C_Fail msg)
|
|
||||||
|
|
||||||
let junk = make_c C_Junk
|
|
||||||
|
|
||||||
let failf fmt = Printf.ksprintf (fun msg -> fail msg) fmt
|
|
||||||
|
|
||||||
let map f x = match x.st with
|
|
||||||
| Compiled (C_Map (g, y)) -> make_c (C_Map (compose f g, y))
|
|
||||||
| Compiled (C_Return x) -> pure (f x)
|
|
||||||
| _ -> make_c (C_Map (f, x))
|
|
||||||
|
|
||||||
let app f x = match f.st with
|
|
||||||
| Compiled (C_Return f) -> map f x
|
|
||||||
| _ -> make_c (C_App (f, x))
|
|
||||||
|
|
||||||
let fun_and f f' x = f x && f' x
|
|
||||||
|
|
||||||
let filter f x = match x.st with
|
|
||||||
| Compiled (C_Return y) -> if f y then return y else fail "filter failed"
|
|
||||||
| Compiled (C_Filter (f', y)) -> make_c (C_Filter (fun_and f f', y))
|
|
||||||
| _ -> make_c (C_Filter (f, x))
|
|
||||||
|
|
||||||
let app_left a b = make_c (C_AppLeft (a, b)) (* return (fun x y -> x) <*> a <*> b *)
|
|
||||||
|
|
||||||
let app_right a b = make_c (C_AppRight (a, b)) (* return (fun x y -> y) <*> a <*> b *)
|
|
||||||
|
|
||||||
let int = make_c C_Int
|
|
||||||
|
|
||||||
let float = make_c C_Float
|
|
||||||
|
|
||||||
let many ?(sep=success) p = make (Many (p, sep, Star))
|
|
||||||
|
|
||||||
let many1 ?(sep=success) p = make (Many (p, sep, Plus))
|
|
||||||
|
|
||||||
let skip p = make (Skip (p, Star))
|
|
||||||
|
|
||||||
let skip1 p = make (Skip (p, Plus))
|
|
||||||
|
|
||||||
let opt p =
|
|
||||||
map
|
|
||||||
(function
|
|
||||||
| [x] -> Some x
|
|
||||||
| [] -> None
|
|
||||||
| _ -> assert false
|
|
||||||
) (make (Many (p, success, Question)))
|
|
||||||
|
|
||||||
let any_of' s = make_c (C_AnyOf s)
|
|
||||||
let any_of s = any_of' (set_of_string s)
|
|
||||||
|
|
||||||
let char c = any_of' (CharSet.singleton c)
|
|
||||||
|
|
||||||
let spaces = skip (any_of " \t")
|
|
||||||
let spaces1 = skip1 (any_of " \t")
|
|
||||||
|
|
||||||
let white = skip (any_of " \t\n")
|
|
||||||
let white1 = skip1 (any_of " \t\n")
|
|
||||||
|
|
||||||
let alpha_lower_ = set_of_string "abcdefghijklmonpqrstuvwxyz"
|
|
||||||
let alpha_upper_ = set_of_string "ABCDEFGHIJKLMONPQRSTUVWXYZ"
|
|
||||||
let num_ = set_of_string "0123456789"
|
|
||||||
let alpha_ = CharSet.union alpha_lower_ alpha_upper_
|
|
||||||
let symbols_ = set_of_string "|!;$#@%&-_/="
|
|
||||||
|
|
||||||
let alpha_lower = any_of' alpha_lower_
|
|
||||||
let alpha_upper = any_of' alpha_upper_
|
|
||||||
let num = any_of' num_
|
|
||||||
let symbols = any_of' symbols_
|
|
||||||
let alpha = any_of' alpha_
|
|
||||||
let alpha_num = any_of' (CharSet.union num_ alpha_)
|
|
||||||
|
|
||||||
let eof = make_c C_Eof
|
|
||||||
|
|
||||||
let switch_c ?default l =
|
|
||||||
if l = [] then match default with
|
|
||||||
| None -> invalid_arg "switch_c: empty list";
|
|
||||||
| Some d -> d
|
|
||||||
else
|
|
||||||
let map = List.fold_left
|
|
||||||
(fun map (c, t) ->
|
|
||||||
if CharMap.mem c map
|
|
||||||
then invalid_arg (str "switch_c: duplicate char %c" c);
|
|
||||||
CharMap.add c t map
|
|
||||||
) CharMap.empty l
|
|
||||||
in
|
|
||||||
make_c (C_SwitchC (map, default))
|
|
||||||
|
|
||||||
exception ExnIsFail of string
|
|
||||||
|
|
||||||
let make_switch_c a b = make_c (C_SwitchC (a, b))
|
|
||||||
|
|
||||||
(* binary choice: compiled into decision tree *)
|
|
||||||
let rec merge a b =
|
|
||||||
(* build a switch by first char *)
|
|
||||||
try
|
|
||||||
begin match a.st, b.st with
|
|
||||||
| Compiled (C_SwitchC (map_a, def_a)),
|
|
||||||
Compiled (C_SwitchC (map_b, def_b)) ->
|
|
||||||
(* merge jump tables *)
|
|
||||||
let def = match def_a, def_b with
|
|
||||||
| None, None -> None
|
|
||||||
| Some d, None
|
|
||||||
| None, Some d -> Some d
|
|
||||||
| Some _, Some _ ->
|
|
||||||
invalid_arg "choice: ambiguous, several parsers accept any input"
|
|
||||||
in
|
|
||||||
let map = CharMap.merge
|
|
||||||
(fun _ a b -> match a, b with
|
|
||||||
| Some a', Some b' -> Some (merge a' b')
|
|
||||||
| Some m, None
|
|
||||||
| None, Some m -> Some m
|
|
||||||
| None, None -> assert false
|
|
||||||
) map_a map_b
|
|
||||||
in
|
|
||||||
make_switch_c map def
|
|
||||||
| Compiled (C_SwitchC (map, def)), other
|
|
||||||
| other, Compiled (C_SwitchC (map, def)) ->
|
|
||||||
let map', def' = match pfc_pc other, def with
|
|
||||||
| AllChars, _ ->
|
|
||||||
invalid_arg "choice: ambiguous, several parsers accept any input"
|
|
||||||
| NoChar, None -> map, Some (make_pc other)
|
|
||||||
| NoChar, Some _ ->
|
|
||||||
invalid_arg "choice: ambiguous"
|
|
||||||
| IsFail msg, _ -> raise (ExnIsFail msg)
|
|
||||||
| NoCharOrSet set, def
|
|
||||||
| Set set, def ->
|
|
||||||
if CharSet.exists (fun c -> CharMap.mem c map) set
|
|
||||||
then invalid_arg
|
|
||||||
(str "choice: ambiguous parsers (overlap on {%s})"
|
|
||||||
(print_char_set (CharSet.inter set (domain_of_char_map map))));
|
|
||||||
(* else: merge jump tables *)
|
|
||||||
let map = map_add_set map set (make_pc other) in
|
|
||||||
map, def
|
|
||||||
in
|
|
||||||
make_switch_c map' def'
|
|
||||||
| _ ->
|
|
||||||
begin match possible_first_chars a, possible_first_chars b with
|
|
||||||
| (Set set1 | NoCharOrSet set1), (Set set2 | NoCharOrSet set2) ->
|
|
||||||
if CharSet.exists (fun c -> CharSet.mem c set2) set1
|
|
||||||
then invalid_arg
|
|
||||||
(str "choice: ambiguous parsers (overlap on {%s})"
|
|
||||||
(print_char_set (CharSet.inter set1 set2)));
|
|
||||||
let map = map_add_set CharMap.empty set1 a in
|
|
||||||
let map = map_add_set map set2 b in
|
|
||||||
make_switch_c map None
|
|
||||||
| IsFail e, _ | _, IsFail e -> raise (ExnIsFail e)
|
|
||||||
| Set s, NoChar -> make_switch_c (map_add_set CharMap.empty s a) (Some b)
|
|
||||||
| NoChar, Set s -> make_switch_c (map_add_set CharMap.empty s b) (Some a)
|
|
||||||
| AllChars, _ | _, AllChars ->
|
|
||||||
invalid_arg "choice: ambiguous parsers (one accepts everything)"
|
|
||||||
| (NoChar | NoCharOrSet _), (NoChar | NoCharOrSet _) ->
|
|
||||||
invalid_arg "choice: ambiguous parsers (both accept nothing)"
|
|
||||||
end
|
|
||||||
end
|
|
||||||
with ExnIsFail msg -> make_c (C_Fail msg)
|
|
||||||
|
|
||||||
let rec choice = function
|
|
||||||
| [] -> invalid_arg "choice: empty list";
|
|
||||||
| [x] -> x
|
|
||||||
| a :: tl -> merge a (choice tl)
|
|
||||||
|
|
||||||
(* temporary structure for buildings switches *)
|
|
||||||
type 'a trie =
|
|
||||||
| TrieLeaf of 'a t
|
|
||||||
| TrieNode of 'a trie CharMap.t
|
|
||||||
|
|
||||||
let trie_empty = TrieNode CharMap.empty
|
|
||||||
|
|
||||||
let rec parser_of_trie : type a. a trie -> a t = function
|
|
||||||
| TrieLeaf p -> p
|
|
||||||
| TrieNode m ->
|
|
||||||
make_switch_c (CharMap.map parser_of_trie' m) None
|
|
||||||
(* consume next char, then build sub-trie *)
|
|
||||||
and parser_of_trie'
|
|
||||||
: type a. a trie -> a t
|
|
||||||
= fun x -> app_right junk (parser_of_trie x)
|
|
||||||
|
|
||||||
(* build prefix trie *)
|
|
||||||
let switch_s l =
|
|
||||||
if l = [] then invalid_arg "switch_s: empty list";
|
|
||||||
(* add parser p in trie [t], with key slice of [s] starting at [i] *)
|
|
||||||
let rec add_trie t s i p =
|
|
||||||
if i = String.length s
|
|
||||||
then match t with
|
|
||||||
| TrieNode m when CharMap.is_empty m -> TrieLeaf p
|
|
||||||
| TrieNode _ -> invalid_arg (str "key \"%s\" is prefix of another key" s)
|
|
||||||
| TrieLeaf _ -> invalid_arg (str "duplicate key \"%s\"" s)
|
|
||||||
else
|
|
||||||
let c = String.get s i in
|
|
||||||
match t with
|
|
||||||
| TrieLeaf _ ->
|
|
||||||
invalid_arg (str "key \"%s\" is prefixed by another key" s)
|
|
||||||
| TrieNode map ->
|
|
||||||
try
|
|
||||||
let sub = CharMap.find c map in
|
|
||||||
let sub = add_trie sub s (i+1) p in
|
|
||||||
TrieNode (CharMap.add c sub map)
|
|
||||||
with Not_found ->
|
|
||||||
let sub = add_trie trie_empty s (i+1) p in
|
|
||||||
TrieNode (CharMap.add c sub map)
|
|
||||||
in
|
|
||||||
let trie =
|
|
||||||
List.fold_left
|
|
||||||
(fun trie (s, p) ->
|
|
||||||
if s = "" then invalid_arg "switch_s: empty string";
|
|
||||||
add_trie trie s 0 p
|
|
||||||
) trie_empty l
|
|
||||||
in
|
|
||||||
parser_of_trie trie
|
|
||||||
|
|
||||||
let bool =
|
|
||||||
switch_s
|
|
||||||
[ "true", return true
|
|
||||||
; "false", return false
|
|
||||||
]
|
|
||||||
|
|
||||||
let fix f =
|
|
||||||
(* outermost lazy needed for the recursive definition *)
|
|
||||||
let rec r = {
|
|
||||||
st=Parse (Lazy (lazy (f r)));
|
|
||||||
} in
|
|
||||||
r
|
|
||||||
|
|
||||||
module Infix = struct
|
|
||||||
let (>|=) x f = map f x
|
|
||||||
let (<*>) = app
|
|
||||||
let (<<) = app_left
|
|
||||||
let (>>) = app_right
|
|
||||||
let (<+>) a b = choice [a; b]
|
|
||||||
let (<::>) a b = pure (fun x l -> x::l) <*> a <*> b
|
|
||||||
end
|
|
||||||
|
|
||||||
include Infix
|
|
||||||
|
|
||||||
let word =
|
|
||||||
pure (fun c s -> str_of_l (c :: s)) <*> alpha <*> many alpha_num
|
|
||||||
|
|
||||||
let quoted =
|
|
||||||
let q = char '"' in
|
|
||||||
let escaped = char '\\' >> char '"' in
|
|
||||||
let inner = choice [escaped; alpha_num; any_of "()' \t\n|!;$#@%&-_/=~.,:<>[]"] in
|
|
||||||
q >> (many inner >|= str_of_l) << q
|
|
||||||
|
|
||||||
(** {2 Compilation} *)
|
|
||||||
|
|
||||||
let encode_cons x sep tl = pure (fun x _sep tl -> x :: tl) <*> x <*> sep <*> tl
|
|
||||||
|
|
||||||
let encode_many
|
|
||||||
: type a. set:CharSet.t -> p:a t -> self:a list t -> sep:unit t -> a list t
|
|
||||||
= fun ~set ~p ~self ~sep ->
|
|
||||||
let on_success = encode_cons p sep self
|
|
||||||
and on_fail = pure [] in
|
|
||||||
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
|
|
||||||
|
|
||||||
let encode_opt ~set x =
|
|
||||||
let mk_one x = [x] in
|
|
||||||
let on_success = make_c (C_Map (mk_one, x))
|
|
||||||
and on_fail = pure [] in
|
|
||||||
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
|
|
||||||
|
|
||||||
let encode_skip
|
|
||||||
: type a. set:CharSet.t -> p:a t -> self:unit t -> unit t
|
|
||||||
= fun ~set ~p ~self ->
|
|
||||||
let on_success = p >> self
|
|
||||||
and on_fail = pure () in
|
|
||||||
make_switch_c (map_add_set CharMap.empty set on_success) (Some on_fail)
|
|
||||||
|
|
||||||
let many_
|
|
||||||
: type a. sep:unit t -> mult:multiplicity -> p:a t -> a list t
|
|
||||||
= fun ~sep ~mult ~p -> match possible_first_chars p with
|
|
||||||
| Set set ->
|
|
||||||
begin match mult with
|
|
||||||
| Star -> fix (fun self -> encode_many ~set ~sep ~p ~self)
|
|
||||||
| Plus -> encode_cons p sep (fix (fun self -> encode_many ~set ~sep ~p ~self))
|
|
||||||
| Question -> encode_opt ~set p
|
|
||||||
end
|
|
||||||
| IsFail msg -> fail msg
|
|
||||||
| NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)")
|
|
||||||
| AllChars -> invalid_arg (str "many: invalid parser (always succeeds)")
|
|
||||||
| NoChar -> invalid_arg (str "many: invalid parser (does not consume input)")
|
|
||||||
|
|
||||||
let skip_ : type a. mult:multiplicity -> p:a t -> unit t
|
|
||||||
= fun ~mult ~p -> match possible_first_chars p with
|
|
||||||
| Set set ->
|
|
||||||
begin match mult with
|
|
||||||
| Star -> fix (fun self -> encode_skip ~set ~p ~self)
|
|
||||||
| Plus -> p >> fix (fun self -> encode_skip ~set ~p ~self)
|
|
||||||
| Question -> encode_opt ~set p >> pure ()
|
|
||||||
end
|
|
||||||
| IsFail msg -> fail msg
|
|
||||||
| NoCharOrSet _ -> invalid_arg (str "many: invalid parser (might not consume input)")
|
|
||||||
| AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)")
|
|
||||||
| NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)")
|
|
||||||
|
|
||||||
let rec compile
|
|
||||||
: type a. a t -> a compiled
|
|
||||||
= fun t -> match t.st with
|
|
||||||
| Compiled c -> c (* already compiled *)
|
|
||||||
| Parse (Many (p, sep, mult)) ->
|
|
||||||
let c = compile (many_ ~sep ~mult ~p) in
|
|
||||||
t.st <- Compiled c;
|
|
||||||
c
|
|
||||||
| Parse (Skip (p, mult)) ->
|
|
||||||
let c = compile (skip_ ~mult ~p) in
|
|
||||||
t.st <- Compiled c;
|
|
||||||
c
|
|
||||||
| Parse (Lazy (lazy p)) ->
|
|
||||||
let c = compile p in
|
|
||||||
t.st <- Compiled c;
|
|
||||||
c
|
|
||||||
|
|
||||||
(** {2 Signatures} *)
|
|
||||||
|
|
||||||
type error = {
|
|
||||||
line: int;
|
|
||||||
col: int;
|
|
||||||
msg: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
let string_of_error e = str "at %d:%d; %s" e.line e.col e.msg
|
|
||||||
|
|
||||||
exception Error of error
|
|
||||||
|
|
||||||
module type S = sig
|
|
||||||
type source
|
|
||||||
(** Source of characters *)
|
|
||||||
|
|
||||||
val parse : source -> 'a t -> ('a, error) result
|
|
||||||
(** Parse the given source using the parser, and returns the parsed value. *)
|
|
||||||
|
|
||||||
val parse': source -> 'a t -> ('a, string) result
|
|
||||||
(** Same as {!parse}, but returns a user-friendly string in case of failure *)
|
|
||||||
|
|
||||||
val parse_exn : source -> 'a t -> 'a
|
|
||||||
(** Unsafe version of {!parse}.
|
|
||||||
@raise Error if parsing fails *)
|
|
||||||
end
|
|
||||||
|
|
||||||
(** {2 Build a parser from a given Monadic Input} *)
|
|
||||||
|
|
||||||
module type INPUT = sig
|
|
||||||
type t
|
|
||||||
|
|
||||||
val read : t -> Bytes.t -> int -> int -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
type token =
|
|
||||||
| Yield of char
|
|
||||||
| EOF
|
|
||||||
|
|
||||||
module type READER = sig
|
|
||||||
type t
|
|
||||||
type source
|
|
||||||
|
|
||||||
val create : source -> t
|
|
||||||
val peek : t -> token (* peek; do not consume *)
|
|
||||||
val next : t -> token (* read and consume *)
|
|
||||||
val junk : t -> unit (* consume last token, obtained with junk *)
|
|
||||||
val line : t -> int
|
|
||||||
val col : t -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
module ReaderOfInput(I : INPUT) : READER with type source = I.t = struct
|
|
||||||
type t = {
|
|
||||||
mutable rline : int;
|
|
||||||
mutable rcol : int;
|
|
||||||
input : I.t;
|
|
||||||
buf : Bytes.t;
|
|
||||||
mutable i : int;
|
|
||||||
mutable len : int;
|
|
||||||
}
|
|
||||||
type source = I.t
|
|
||||||
|
|
||||||
let line t = t.rline
|
|
||||||
let col t = t.rcol
|
|
||||||
|
|
||||||
let create input = {
|
|
||||||
rline=1;
|
|
||||||
rcol=0;
|
|
||||||
input;
|
|
||||||
buf = Bytes.make 1024 ' ';
|
|
||||||
i=1;
|
|
||||||
len=1; (* trick for initialization *)
|
|
||||||
}
|
|
||||||
|
|
||||||
let read_next t =
|
|
||||||
let c = Bytes.get t.buf t.i in
|
|
||||||
t.i <- t.i + 1;
|
|
||||||
if c = '\n' then (
|
|
||||||
t.rcol <- 0;
|
|
||||||
t.rline <- t.rline + 1;
|
|
||||||
) else (
|
|
||||||
t.rcol <- t.rcol + 1
|
|
||||||
);
|
|
||||||
Yield c
|
|
||||||
|
|
||||||
let refill t =
|
|
||||||
t.len <- I.read t.input t.buf 0 (Bytes.length t.buf);
|
|
||||||
t.i <- 0;
|
|
||||||
()
|
|
||||||
|
|
||||||
let next t =
|
|
||||||
if t.len = 0 then EOF
|
|
||||||
else if t.i = t.len
|
|
||||||
then (
|
|
||||||
refill t;
|
|
||||||
if t.len = 0 then EOF else read_next t
|
|
||||||
) else read_next t
|
|
||||||
|
|
||||||
let peek t =
|
|
||||||
if t.i = t.len
|
|
||||||
then refill t;
|
|
||||||
Yield (Bytes.get t.buf t.i)
|
|
||||||
|
|
||||||
let junk t =
|
|
||||||
assert (t.len > 0 && t.i < t.len);
|
|
||||||
t.i <- t.i + 1
|
|
||||||
end
|
|
||||||
|
|
||||||
module MakeFromReader(R : READER) : S with type source = R.source = struct
|
|
||||||
type source = R.source
|
|
||||||
|
|
||||||
let error r msg =
|
|
||||||
raise (Error {
|
|
||||||
line = R.line r;
|
|
||||||
col = R.col r;
|
|
||||||
msg;
|
|
||||||
})
|
|
||||||
let errorf r fmt =
|
|
||||||
Printf.ksprintf
|
|
||||||
(fun msg -> error r msg)
|
|
||||||
fmt
|
|
||||||
|
|
||||||
let is_int c = Char.code c >= Char.code '0' && Char.code c <= Char.code '9'
|
|
||||||
let to_int c = Char.code c - Char.code '0'
|
|
||||||
|
|
||||||
let rec parse_int r ~sign i = match R.peek r with
|
|
||||||
| EOF -> i
|
|
||||||
| Yield c when is_int c ->
|
|
||||||
R.junk r;
|
|
||||||
parse_int r ~sign (10 * i + to_int c)
|
|
||||||
| Yield '-' when i = 0 && sign ->
|
|
||||||
(* switch sign: only on first char *)
|
|
||||||
R.junk r;
|
|
||||||
parse_int r ~sign:false 0
|
|
||||||
| _ -> if sign then i else -i
|
|
||||||
|
|
||||||
let parse_float _r _buf = assert false
|
|
||||||
|
|
||||||
let rec parse_rec : type a. R.t -> a t -> a =
|
|
||||||
fun r p -> match compile p with
|
|
||||||
| C_Return x -> x
|
|
||||||
| C_Map (f, x) ->
|
|
||||||
let y = parse_rec r x in
|
|
||||||
f y
|
|
||||||
| C_Filter (f, x) ->
|
|
||||||
let y = parse_rec r x in
|
|
||||||
if f y then y else errorf r "filter failed"
|
|
||||||
| C_App (f, x) ->
|
|
||||||
let f' = parse_rec r f in
|
|
||||||
let x' = parse_rec r x in
|
|
||||||
f' x'
|
|
||||||
| C_AppLeft (a, b) ->
|
|
||||||
let a' = parse_rec r a in
|
|
||||||
let _ = parse_rec r b in
|
|
||||||
a'
|
|
||||||
| C_AppRight (a, b) ->
|
|
||||||
let _ = parse_rec r a in
|
|
||||||
let b' = parse_rec r b in
|
|
||||||
b'
|
|
||||||
| C_Fail msg -> error r msg
|
|
||||||
| C_Int -> parse_int r ~sign:true 0
|
|
||||||
| C_Float -> parse_float r (Buffer.create 8)
|
|
||||||
| C_Junk -> R.junk r
|
|
||||||
| C_AnyOf set ->
|
|
||||||
begin match R.next r with
|
|
||||||
| EOF -> errorf r "expected any of %s, got EOF" (print_char_set set)
|
|
||||||
| Yield c ->
|
|
||||||
if CharSet.mem c set then c
|
|
||||||
else errorf r "expected any of %s, got '%s'" (print_char_set set) (print_char c)
|
|
||||||
end
|
|
||||||
| C_SwitchC (map, def) ->
|
|
||||||
begin match R.peek r with
|
|
||||||
| EOF -> errorf r "expected any of %s, got EOF" (print_char_map map)
|
|
||||||
| Yield c ->
|
|
||||||
begin try
|
|
||||||
let p' = CharMap.find c map in
|
|
||||||
parse_rec r p'
|
|
||||||
with Not_found -> match def with
|
|
||||||
| None ->
|
|
||||||
errorf r "expected any of %s, got %c" (print_char_map map) c
|
|
||||||
| Some d -> parse_rec r d
|
|
||||||
end
|
|
||||||
end
|
|
||||||
| C_Eof ->
|
|
||||||
begin match R.next r with
|
|
||||||
| EOF -> ()
|
|
||||||
| Yield c -> errorf r "expected EOF, got %c" c
|
|
||||||
end
|
|
||||||
|
|
||||||
(* public functions *)
|
|
||||||
let parse_exn src p =
|
|
||||||
let r = R.create src in
|
|
||||||
parse_rec r p
|
|
||||||
|
|
||||||
let parse src p =
|
|
||||||
let r = R.create src in
|
|
||||||
try
|
|
||||||
`Ok (parse_rec r p)
|
|
||||||
with Error e ->
|
|
||||||
`Error e
|
|
||||||
|
|
||||||
let parse' src p = match parse src p with
|
|
||||||
| `Ok x -> `Ok x
|
|
||||||
| `Error e -> `Error (string_of_error e)
|
|
||||||
end
|
|
||||||
|
|
||||||
module Make(I : INPUT) = struct
|
|
||||||
module R = ReaderOfInput(I)
|
|
||||||
include MakeFromReader(R)
|
|
||||||
end
|
|
||||||
|
|
||||||
module Str = MakeFromReader(struct
|
|
||||||
(* reader of string *)
|
|
||||||
type t = {
|
|
||||||
str : string;
|
|
||||||
mutable i : int;
|
|
||||||
mutable rcol : int;
|
|
||||||
mutable rline : int;
|
|
||||||
}
|
|
||||||
type source = string
|
|
||||||
|
|
||||||
let create str = {
|
|
||||||
str;
|
|
||||||
i = 0;
|
|
||||||
rcol = 1;
|
|
||||||
rline = 1;
|
|
||||||
}
|
|
||||||
let line t = t.rline
|
|
||||||
let col t = t.rcol
|
|
||||||
let peek t =
|
|
||||||
if t.i = String.length t.str then EOF else Yield (String.get t.str t.i)
|
|
||||||
let junk t =
|
|
||||||
assert (t.i < String.length t.str);
|
|
||||||
t.i <- t.i + 1
|
|
||||||
let next t =
|
|
||||||
if t.i = String.length t.str then EOF
|
|
||||||
else (
|
|
||||||
let c = String.get t.str t.i in
|
|
||||||
t.i <- t.i + 1;
|
|
||||||
if c = '\n' then (
|
|
||||||
t.rcol <- 1;
|
|
||||||
t.rline <- t.rline + 1
|
|
||||||
) else t.rcol <- t.rcol + 1;
|
|
||||||
Yield c
|
|
||||||
)
|
|
||||||
end)
|
|
||||||
|
|
||||||
module Chan = Make(struct
|
|
||||||
type t = in_channel
|
|
||||||
let read = input
|
|
||||||
end)
|
|
||||||
|
|
@ -1,272 +0,0 @@
|
||||||
(*
|
|
||||||
copyright (c) 2013-2015, simon cruanes
|
|
||||||
all rights reserved.
|
|
||||||
|
|
||||||
redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are met:
|
|
||||||
|
|
||||||
redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer. redistributions in binary
|
|
||||||
form must reproduce the above copyright notice, this list of conditions and the
|
|
||||||
following disclaimer in the documentation and/or other materials provided with
|
|
||||||
the distribution.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
||||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*)
|
|
||||||
|
|
||||||
(** {1 Applicative Parser Combinators}
|
|
||||||
|
|
||||||
Example: basic S-expr parser
|
|
||||||
|
|
||||||
{[
|
|
||||||
open Containers_string.App_parse;;
|
|
||||||
|
|
||||||
type sexp = Atom of string | List of sexp list;;
|
|
||||||
|
|
||||||
let mkatom a = Atom a;;
|
|
||||||
let mklist l = List l;;
|
|
||||||
|
|
||||||
let ident_char = alpha_num <+> any_of "|!;$#@%&-_/=*.:~+[]<>'" ;;
|
|
||||||
let ident = many1 ident_char >|= str_of_l ;;
|
|
||||||
let atom = (ident <+> quoted) >|= mkatom ;;
|
|
||||||
|
|
||||||
let sexp = fix (fun sexp ->
|
|
||||||
white >>
|
|
||||||
(atom <+>
|
|
||||||
((char '(' >> many sexp << char ')') >|= mklist)
|
|
||||||
)
|
|
||||||
);;
|
|
||||||
|
|
||||||
Str.parse_exn "(a (b c d) e)" sexp;;
|
|
||||||
|
|
||||||
]}
|
|
||||||
|
|
||||||
@deprecated CCParse is more expressive and stable
|
|
||||||
|
|
||||||
{b status: deprecated}
|
|
||||||
@since 0.10
|
|
||||||
*)
|
|
||||||
|
|
||||||
type ('a,'b) result = [`Error of 'b | `Ok of 'a]
|
|
||||||
|
|
||||||
type 'a t
|
|
||||||
(** Parser that yields an error or a value of type 'a *)
|
|
||||||
|
|
||||||
(** {6 Combinators} *)
|
|
||||||
|
|
||||||
val return : 'a -> 'a t
|
|
||||||
(** Parser that succeeds with the given value *)
|
|
||||||
|
|
||||||
val pure : 'a -> 'a t
|
|
||||||
(** Synonym to {!return} *)
|
|
||||||
|
|
||||||
val junk : unit t
|
|
||||||
(** Skip next char *)
|
|
||||||
|
|
||||||
val fail : string -> 'a t
|
|
||||||
(** [fail msg] fails with the given error message *)
|
|
||||||
|
|
||||||
val failf : ('a, unit, string, 'b t) format4 -> 'a
|
|
||||||
|
|
||||||
val app : ('a -> 'b) t -> 'a t -> 'b t
|
|
||||||
(** Applicative *)
|
|
||||||
|
|
||||||
val map : ('a -> 'b) -> 'a t -> 'b t
|
|
||||||
(** Map the parsed value *)
|
|
||||||
|
|
||||||
val int : int t
|
|
||||||
(** Parse an integer *)
|
|
||||||
|
|
||||||
val float : float t
|
|
||||||
(** Parse a floating point number *)
|
|
||||||
|
|
||||||
val bool : bool t
|
|
||||||
(** Parse "true" or "false" *)
|
|
||||||
|
|
||||||
val char : char -> char t
|
|
||||||
(** [char c] parses [c] and [c] only *)
|
|
||||||
|
|
||||||
val any_of : string -> char t
|
|
||||||
(** Parse any of the chars present in the given string *)
|
|
||||||
|
|
||||||
val alpha_lower : char t
|
|
||||||
|
|
||||||
val alpha_upper : char t
|
|
||||||
|
|
||||||
val alpha : char t
|
|
||||||
|
|
||||||
val symbols : char t
|
|
||||||
(** Symbols, such as "!-=_"... *)
|
|
||||||
|
|
||||||
val num : char t
|
|
||||||
|
|
||||||
val alpha_num : char t
|
|
||||||
|
|
||||||
val word : string t
|
|
||||||
(** [word] parses any identifier not starting with an integer and
|
|
||||||
not containing any whitespace nor delimiter
|
|
||||||
TODO: specify *)
|
|
||||||
|
|
||||||
val quoted : string t
|
|
||||||
(** Quoted string, following OCaml conventions *)
|
|
||||||
|
|
||||||
val str_of_l : char list -> string
|
|
||||||
(** Helper to build strings from lists of chars *)
|
|
||||||
|
|
||||||
val spaces : unit t
|
|
||||||
(** Parse a sequence of ['\t'] and [' '] *)
|
|
||||||
|
|
||||||
val spaces1 : unit t
|
|
||||||
(** Same as {!spaces} but requires at least one space *)
|
|
||||||
|
|
||||||
val white : unit t
|
|
||||||
(** Parse a sequence of ['\t'], ['\n'] and [' '] *)
|
|
||||||
|
|
||||||
val white1 : unit t
|
|
||||||
|
|
||||||
val eof : unit t
|
|
||||||
(** Matches the end of input, fails otherwise *)
|
|
||||||
|
|
||||||
val many : ?sep:unit t -> 'a t -> 'a list t
|
|
||||||
(** 0 or more parsed elements of the given type.
|
|
||||||
@param sep separator between elements of the list (for instance, {!space}) *)
|
|
||||||
|
|
||||||
val many1 : ?sep:unit t -> 'a t -> 'a list t
|
|
||||||
(** Same as {!many}, but needs at least one element *)
|
|
||||||
|
|
||||||
val skip : _ t -> unit t
|
|
||||||
(** Skip 0 or more instances of the given parser *)
|
|
||||||
|
|
||||||
val skip1 : _ t -> unit t
|
|
||||||
|
|
||||||
val opt : 'a t -> 'a option t
|
|
||||||
(** [opt x] tries to parse [x], and returns [None] otherwise *)
|
|
||||||
|
|
||||||
val filter : ('a -> bool) -> 'a t -> 'a t
|
|
||||||
(** [filter f p] parses the same as [p], but fails if the returned value
|
|
||||||
does not satisfy [f] *)
|
|
||||||
|
|
||||||
|
|
||||||
(* TODO: complement operator any_but (all but \, for instance) *)
|
|
||||||
(* TODO: a "if-then-else" combinator (assuming the test has a
|
|
||||||
set of possible first chars) *)
|
|
||||||
|
|
||||||
val switch_c : ?default:'a t -> (char * 'a t) list -> 'a t
|
|
||||||
(** [switch_c l] matches the next char and uses the corresponding parser.
|
|
||||||
Fails if the next char is not in the list, unless default is defined.
|
|
||||||
@param default parser to use if no char matches
|
|
||||||
@raise Invalid_argument if some char occurs several times in [l] *)
|
|
||||||
|
|
||||||
val switch_s : (string * 'a t) list -> 'a t
|
|
||||||
(** [switch_s l] attempts to match matches any of the strings in [l].
|
|
||||||
If one of those strings matches, the corresponding parser
|
|
||||||
is used from now on.
|
|
||||||
@raise Invalid_argument if some string is a prefix of another string,
|
|
||||||
or is empty, or if the list is empty *)
|
|
||||||
|
|
||||||
val choice : 'a t list -> 'a t
|
|
||||||
(** [choice l] chooses between the parsers, unambiguously
|
|
||||||
@raise Invalid_argument if the list is empty, or if some parsers
|
|
||||||
overlap, making the choice ambiguous *)
|
|
||||||
|
|
||||||
val fix : ('a t -> 'a t) -> 'a t
|
|
||||||
(** [fix f] makes a fixpoint *)
|
|
||||||
|
|
||||||
module Infix : sig
|
|
||||||
val (>|=) : 'a t -> ('a -> 'b) -> 'b t
|
|
||||||
(** Infix version of {!map} *)
|
|
||||||
|
|
||||||
val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
|
|
||||||
(** Synonym to {!app} *)
|
|
||||||
|
|
||||||
val (>>) : _ t -> 'a t -> 'a t
|
|
||||||
(** [a >> b] parses [a], ignores its result, then parses [b] *)
|
|
||||||
|
|
||||||
val (<<) : 'a t -> _ t -> 'a t
|
|
||||||
(** [a << b] parses [a], then [b], and discards [b] to return [a] *)
|
|
||||||
|
|
||||||
val (<+>) : 'a t -> 'a t -> 'a t
|
|
||||||
(** [a <+> b] is [choice [a;b]], a binary choice *)
|
|
||||||
|
|
||||||
val (<::>) : 'a t -> 'a list t -> 'a list t
|
|
||||||
(** [a <::> b] is [app (fun x l -> x::l) a b] *)
|
|
||||||
end
|
|
||||||
|
|
||||||
include module type of Infix
|
|
||||||
|
|
||||||
(** {2 Signatures} *)
|
|
||||||
|
|
||||||
(** {6 Parsing} *)
|
|
||||||
|
|
||||||
type error = {
|
|
||||||
line: int;
|
|
||||||
col: int;
|
|
||||||
msg: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
val string_of_error : error -> string
|
|
||||||
|
|
||||||
exception Error of error
|
|
||||||
|
|
||||||
module type S = sig
|
|
||||||
type source
|
|
||||||
(** Source of characters *)
|
|
||||||
|
|
||||||
val parse : source -> 'a t -> ('a, error) result
|
|
||||||
(** Parse the given source using the parser, and returns the parsed value. *)
|
|
||||||
|
|
||||||
val parse': source -> 'a t -> ('a, string) result
|
|
||||||
(** Same as {!parse}, but returns a user-friendly string in case of failure *)
|
|
||||||
|
|
||||||
val parse_exn : source -> 'a t -> 'a
|
|
||||||
(** Unsafe version of {!parse}.
|
|
||||||
@raise Error if parsing fails *)
|
|
||||||
end
|
|
||||||
|
|
||||||
(** {2 Parse} *)
|
|
||||||
|
|
||||||
module type INPUT = sig
|
|
||||||
type t
|
|
||||||
|
|
||||||
val read : t -> Bytes.t -> int -> int -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
module Make(I : INPUT) : S with type source = I.t
|
|
||||||
|
|
||||||
(** {2 Low-level interface} *)
|
|
||||||
|
|
||||||
val print : Format.formatter -> _ t -> unit
|
|
||||||
(** Print a parser structure, for debug purpose *)
|
|
||||||
|
|
||||||
type token =
|
|
||||||
| Yield of char
|
|
||||||
| EOF
|
|
||||||
|
|
||||||
module type READER = sig
|
|
||||||
type t
|
|
||||||
type source (* underlying source *)
|
|
||||||
|
|
||||||
val create : source -> t
|
|
||||||
val peek : t -> token (* peek; do not consume *)
|
|
||||||
val next : t -> token (* read and consume *)
|
|
||||||
val junk : t -> unit (* consume last token, obtained with junk *)
|
|
||||||
val line : t -> int
|
|
||||||
val col : t -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
module MakeFromReader(R : READER) : S with type source = R.source
|
|
||||||
|
|
||||||
(** {2 Defaults} *)
|
|
||||||
|
|
||||||
module Str : S with type source = string
|
|
||||||
|
|
||||||
module Chan : S with type source = in_channel
|
|
||||||
|
|
@ -1,823 +0,0 @@
|
||||||
(*
|
|
||||||
copyright (c) 2013, simon cruanes
|
|
||||||
all rights reserved.
|
|
||||||
|
|
||||||
redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are met:
|
|
||||||
|
|
||||||
redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer. redistributions in binary
|
|
||||||
form must reproduce the above copyright notice, this list of conditions and the
|
|
||||||
following disclaimer in the documentation and/or other materials provided with
|
|
||||||
the distribution.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
||||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*)
|
|
||||||
|
|
||||||
(** {1 Levenshtein distance} *)
|
|
||||||
|
|
||||||
type 'a sequence = ('a -> unit) -> unit
|
|
||||||
type 'a gen = unit -> 'a option
|
|
||||||
|
|
||||||
module type STRING = sig
|
|
||||||
type char_
|
|
||||||
type t
|
|
||||||
|
|
||||||
val of_list : char_ list -> t
|
|
||||||
val get : t -> int -> char_
|
|
||||||
val length : t -> int
|
|
||||||
val compare_char : char_ -> char_ -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
(** Continuation list *)
|
|
||||||
type 'a klist = unit ->
|
|
||||||
[
|
|
||||||
| `Nil
|
|
||||||
| `Cons of 'a * 'a klist
|
|
||||||
]
|
|
||||||
|
|
||||||
let rec klist_to_list l = match l () with
|
|
||||||
| `Nil -> []
|
|
||||||
| `Cons (x,k) -> x :: klist_to_list k
|
|
||||||
|
|
||||||
(*$inject
|
|
||||||
open CCFun
|
|
||||||
|
|
||||||
let list_uniq_ = Q.(
|
|
||||||
let gen = Gen.(list_size (0 -- 100) (string_size ~gen:printable (1 -- 10))
|
|
||||||
>|= CCList.sort_uniq ~cmp:String.compare
|
|
||||||
>|= List.map (fun s->s,s)
|
|
||||||
) in
|
|
||||||
let print = Print.(list (pair string string)) in
|
|
||||||
let shrink = Shrink.(list ~shrink:(pair string string)) in
|
|
||||||
make ~small:List.length ~print ~shrink gen
|
|
||||||
)
|
|
||||||
*)
|
|
||||||
|
|
||||||
(*$Q
|
|
||||||
Q.(string_of_size Gen.(0 -- 30)) (fun s -> \
|
|
||||||
let a = of_string ~limit:1 s in \
|
|
||||||
match_with a s)
|
|
||||||
*)
|
|
||||||
|
|
||||||
(* test that building a from s, and mutating one char of s, yields
|
|
||||||
a string s' that is accepted by a.
|
|
||||||
|
|
||||||
--> generate triples (s, i, c) where c is a char, s a non empty string
|
|
||||||
and i a valid index in s
|
|
||||||
*)
|
|
||||||
|
|
||||||
(*$QR
|
|
||||||
(
|
|
||||||
let gen = Q.Gen.(
|
|
||||||
3 -- 10 >>= fun len ->
|
|
||||||
0 -- (len-1) >>= fun i ->
|
|
||||||
string_size (return len) >>= fun s ->
|
|
||||||
char >|= fun c -> (s,i,c)
|
|
||||||
) in
|
|
||||||
let small (s,_,_) = String.length s in
|
|
||||||
Q.make ~small gen
|
|
||||||
)
|
|
||||||
(fun (s,i,c) ->
|
|
||||||
let s' = Bytes.of_string s in
|
|
||||||
Bytes.set s' i c;
|
|
||||||
let a = of_string ~limit:1 s in
|
|
||||||
match_with a (Bytes.to_string s')
|
|
||||||
)
|
|
||||||
*)
|
|
||||||
|
|
||||||
(* test that, for an index, all retrieved strings are at a distance to
|
|
||||||
the key that is not too high *)
|
|
||||||
(*$QR & ~count:30
|
|
||||||
(
|
|
||||||
let mklist l =
|
|
||||||
let l' = List.map (fun s->s,s) l in
|
|
||||||
l, Index.of_list l'
|
|
||||||
in
|
|
||||||
let gen = Q.Gen.(
|
|
||||||
list_size (3 -- 15) (string_size (1 -- 10)) >|= mklist
|
|
||||||
) in
|
|
||||||
let small (l,_) = List.length l in
|
|
||||||
let print (l,_) = Q.Print.(list string) l in
|
|
||||||
let shrink (l,_) = Sequence.map mklist (Q.Shrink.list l) in
|
|
||||||
Q.make ~small ~print ~shrink gen
|
|
||||||
)
|
|
||||||
(fun (l,idx) ->
|
|
||||||
List.for_all
|
|
||||||
(fun s ->
|
|
||||||
let retrieved = Index.retrieve ~limit:2 idx s
|
|
||||||
|> klist_to_list in
|
|
||||||
List.for_all
|
|
||||||
(fun s' -> edit_distance s s' <= 2) retrieved &&
|
|
||||||
List.for_all
|
|
||||||
(fun s' -> not (edit_distance s s' <= 2) || List.mem s' retrieved)
|
|
||||||
l
|
|
||||||
) l
|
|
||||||
)
|
|
||||||
|
|
||||||
*)
|
|
||||||
|
|
||||||
(*$R
|
|
||||||
let idx = Index.of_list ["aa", "aa"; "ab", "ab"; "cd", "cd"; "a'c", "a'c"] in
|
|
||||||
assert_equal ~printer:Q.Print.(list string)
|
|
||||||
["a'c"; "aa"; "ab"]
|
|
||||||
(Index.retrieve ~limit:1 idx "ac" |> CCKList.to_list
|
|
||||||
|> List.sort Pervasives.compare)
|
|
||||||
*)
|
|
||||||
|
|
||||||
module type S = sig
|
|
||||||
type char_
|
|
||||||
type string_
|
|
||||||
|
|
||||||
(** {6 Edit Distance} *)
|
|
||||||
|
|
||||||
val edit_distance : string_ -> string_ -> int
|
|
||||||
(** Edition distance between two strings. This satisfies the classical
|
|
||||||
distance axioms: it is always positive, symmetric, and satisfies
|
|
||||||
the formula [distance a b + distance b c >= distance a c] *)
|
|
||||||
|
|
||||||
(** {6 Automaton}
|
|
||||||
An automaton, built from a string [s] and a limit [n], that accepts
|
|
||||||
every string that is at distance at most [n] from [s]. *)
|
|
||||||
|
|
||||||
type automaton
|
|
||||||
(** Levenshtein automaton *)
|
|
||||||
|
|
||||||
val of_string : limit:int -> string_ -> automaton
|
|
||||||
(** Build an automaton from a string, with a maximal distance [limit].
|
|
||||||
The automaton will accept strings whose {!edit_distance} to the
|
|
||||||
parameter is at most [limit]. *)
|
|
||||||
|
|
||||||
val of_list : limit:int -> char_ list -> automaton
|
|
||||||
(** Build an automaton from a list, with a maximal distance [limit] *)
|
|
||||||
|
|
||||||
val debug_print : (out_channel -> char_ -> unit) ->
|
|
||||||
out_channel -> automaton -> unit
|
|
||||||
(** Output the automaton's structure on the given channel. *)
|
|
||||||
|
|
||||||
val match_with : automaton -> string_ -> bool
|
|
||||||
(** [match_with a s] matches the string [s] against [a], and returns
|
|
||||||
[true] if the distance from [s] to the word represented by [a] is smaller
|
|
||||||
than the limit used to build [a] *)
|
|
||||||
|
|
||||||
(** {6 Index for one-to-many matching} *)
|
|
||||||
|
|
||||||
module Index : sig
|
|
||||||
type 'b t
|
|
||||||
(** Index that maps strings to values of type 'b. Internally it is
|
|
||||||
based on a trie. A string can only map to one value. *)
|
|
||||||
|
|
||||||
val empty : 'b t
|
|
||||||
(** Empty index *)
|
|
||||||
|
|
||||||
val is_empty : _ t -> bool
|
|
||||||
|
|
||||||
val add : 'b t -> string_ -> 'b -> 'b t
|
|
||||||
(** Add a pair string/value to the index. If a value was already present
|
|
||||||
for this string it is replaced. *)
|
|
||||||
|
|
||||||
val cardinal : _ t -> int
|
|
||||||
(** Number of bindings *)
|
|
||||||
|
|
||||||
val remove : 'b t -> string_ -> 'b t
|
|
||||||
(** Remove a string (and its associated value, if any) from the index. *)
|
|
||||||
|
|
||||||
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
|
|
||||||
(** Lazy list of objects associated to strings close to the query string *)
|
|
||||||
|
|
||||||
val of_list : (string_ * 'b) list -> 'b t
|
|
||||||
(** Build an index from a list of pairs of strings and values *)
|
|
||||||
|
|
||||||
val to_list : 'b t -> (string_ * 'b) list
|
|
||||||
(** Extract a list of pairs from an index *)
|
|
||||||
|
|
||||||
val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val of_seq : (string_ * 'a) sequence -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val to_seq : 'a t -> (string_ * 'a) sequence
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val add_gen : 'a t -> (string_ * 'a) gen -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val of_gen : (string_ * 'a) gen -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val to_gen : 'a t -> (string_ * 'a) gen
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
|
|
||||||
(** Fold over the stored pairs string/value *)
|
|
||||||
|
|
||||||
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
|
|
||||||
(** Iterate on the pairs *)
|
|
||||||
|
|
||||||
val to_klist : 'b t -> (string_ * 'b) klist
|
|
||||||
(** Conversion to an iterator *)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
module Make(Str : STRING)
|
|
||||||
: S with type char_ = Str.char_ and type string_ = Str.t = struct
|
|
||||||
type string_ = Str.t
|
|
||||||
type char_ = Str.char_
|
|
||||||
|
|
||||||
let edit_distance s1 s2 =
|
|
||||||
if Str.length s1 = 0
|
|
||||||
then Str.length s2
|
|
||||||
else if Str.length s2 = 0
|
|
||||||
then Str.length s1
|
|
||||||
else if s1 = s2
|
|
||||||
then 0
|
|
||||||
else begin
|
|
||||||
(* distance vectors (v0=previous, v1=current) *)
|
|
||||||
let v0 = Array.make (Str.length s2 + 1) 0 in
|
|
||||||
let v1 = Array.make (Str.length s2 + 1) 0 in
|
|
||||||
(* initialize v0: v0(i) = A(0)(i) = delete i chars from t *)
|
|
||||||
for i = 0 to Str.length s2 do
|
|
||||||
v0.(i) <- i
|
|
||||||
done;
|
|
||||||
(* main loop for the bottom up dynamic algorithm *)
|
|
||||||
for i = 0 to Str.length s1 - 1 do
|
|
||||||
(* first edit distance is the deletion of i+1 elements from s *)
|
|
||||||
v1.(0) <- i+1;
|
|
||||||
|
|
||||||
(* try add/delete/replace operations *)
|
|
||||||
for j = 0 to Str.length s2 - 1 do
|
|
||||||
let cost = if Str.compare_char (Str.get s1 i) (Str.get s2 j) = 0 then 0 else 1 in
|
|
||||||
v1.(j+1) <- min (v1.(j) + 1) (min (v0.(j+1) + 1) (v0.(j) + cost));
|
|
||||||
done;
|
|
||||||
|
|
||||||
(* copy v1 into v0 for next iteration *)
|
|
||||||
Array.blit v1 0 v0 0 (Str.length s2 + 1);
|
|
||||||
done;
|
|
||||||
v1.(Str.length s2)
|
|
||||||
end
|
|
||||||
|
|
||||||
module NDA = struct
|
|
||||||
type char =
|
|
||||||
| Any
|
|
||||||
| Char of char_
|
|
||||||
type transition =
|
|
||||||
| Success
|
|
||||||
| Upon of char * int * int
|
|
||||||
| Epsilon of int * int
|
|
||||||
|
|
||||||
(* non deterministic automaton *)
|
|
||||||
type _t = transition list array array
|
|
||||||
|
|
||||||
let length nda = Array.length nda
|
|
||||||
|
|
||||||
let rec mem_tr tr l = match tr, l with
|
|
||||||
| _, [] -> false
|
|
||||||
| Success, Success::_ -> true
|
|
||||||
| Epsilon (i,j), Epsilon(i',j')::_ -> i=i' && j=j'
|
|
||||||
| Upon (Any,i,j), Upon(Any,i',j')::_ when i=i' && j=j' -> true
|
|
||||||
| Upon (Char c,i,j), Upon(Char c',i',j')::_
|
|
||||||
when Str.compare_char c c' = 0 && i=i' && j=j' -> true
|
|
||||||
| _, _::l' -> mem_tr tr l'
|
|
||||||
|
|
||||||
(* build NDA from the string *)
|
|
||||||
let make ~limit s =
|
|
||||||
let len = Str.length s in
|
|
||||||
let m = Array.make_matrix (len +1) (limit+1) [] in
|
|
||||||
let add_transition i j tr =
|
|
||||||
if not (mem_tr tr m.(i).(j))
|
|
||||||
then m.(i).(j) <- tr :: m.(i).(j)
|
|
||||||
in
|
|
||||||
(* internal transitions *)
|
|
||||||
for i = 0 to len-1 do
|
|
||||||
for j = 0 to limit do
|
|
||||||
(* correct char *)
|
|
||||||
add_transition i j (Upon (Char (Str.get s i), i+1, j));
|
|
||||||
(* other transitions *)
|
|
||||||
if j < limit then begin
|
|
||||||
(* substitution *)
|
|
||||||
add_transition i j (Upon (Any, i+1, j+1));
|
|
||||||
(* deletion in indexed string *)
|
|
||||||
add_transition i j (Upon (Any, i, j+1));
|
|
||||||
(* addition to indexed string *)
|
|
||||||
add_transition i j (Epsilon (i+1, j+1));
|
|
||||||
end
|
|
||||||
done
|
|
||||||
done;
|
|
||||||
for j = 0 to limit do
|
|
||||||
(* deletions at the end *)
|
|
||||||
if j < limit
|
|
||||||
then add_transition len j (Upon (Any, len, j+1));
|
|
||||||
(* win in any case *)
|
|
||||||
add_transition len j Success;
|
|
||||||
done;
|
|
||||||
m
|
|
||||||
|
|
||||||
let get nda (i,j) =
|
|
||||||
nda.(i).(j)
|
|
||||||
|
|
||||||
let is_final nda (i,j) =
|
|
||||||
List.exists
|
|
||||||
(function Success -> true | _ -> false)
|
|
||||||
(get nda (i,j))
|
|
||||||
end
|
|
||||||
|
|
||||||
(** deterministic automaton *)
|
|
||||||
module DFA = struct
|
|
||||||
type t = {
|
|
||||||
mutable transitions : (char_ * int) list array;
|
|
||||||
mutable is_final : bool array;
|
|
||||||
mutable otherwise : int array; (* transition by default *)
|
|
||||||
mutable len : int;
|
|
||||||
}
|
|
||||||
|
|
||||||
let create size = {
|
|
||||||
len = 0;
|
|
||||||
transitions = Array.make size [];
|
|
||||||
is_final = Array.make size false;
|
|
||||||
otherwise = Array.make size ~-1;
|
|
||||||
}
|
|
||||||
|
|
||||||
let _double_array ~init a =
|
|
||||||
let a' = Array.make (2 * Array.length a) init in
|
|
||||||
Array.blit a 0 a' 0 (Array.length a);
|
|
||||||
a'
|
|
||||||
|
|
||||||
(* add a new state *)
|
|
||||||
let add_state dfa =
|
|
||||||
let n = dfa.len in
|
|
||||||
(* resize *)
|
|
||||||
if n = Array.length dfa.transitions then begin
|
|
||||||
dfa.transitions <- _double_array ~init:[] dfa.transitions;
|
|
||||||
dfa.is_final <- _double_array ~init:false dfa.is_final;
|
|
||||||
dfa.otherwise <- _double_array ~init:~-1 dfa.otherwise;
|
|
||||||
end;
|
|
||||||
dfa.len <- n + 1;
|
|
||||||
n
|
|
||||||
|
|
||||||
let rec __mem_tr tr l = match tr, l with
|
|
||||||
| _, [] -> false
|
|
||||||
| (c,i), (c',i')::l' ->
|
|
||||||
(i=i' && compare c c' = 0)
|
|
||||||
|| __mem_tr tr l'
|
|
||||||
|
|
||||||
(* add transition *)
|
|
||||||
let add_transition dfa i tr =
|
|
||||||
if not (__mem_tr tr dfa.transitions.(i))
|
|
||||||
then dfa.transitions.(i) <- tr :: dfa.transitions.(i)
|
|
||||||
|
|
||||||
let add_otherwise dfa i j =
|
|
||||||
dfa.otherwise.(i) <- j
|
|
||||||
|
|
||||||
let set_final dfa i =
|
|
||||||
dfa.is_final.(i) <- true
|
|
||||||
|
|
||||||
(* set of pairs of ints: used for representing a set of states of the NDA *)
|
|
||||||
module NDAStateSet = Set.Make(struct
|
|
||||||
type t = int * int
|
|
||||||
let compare = Pervasives.compare
|
|
||||||
end)
|
|
||||||
|
|
||||||
let _set_to_string s =
|
|
||||||
let b = Buffer.create 15 in
|
|
||||||
Buffer.add_char b '{';
|
|
||||||
NDAStateSet.iter
|
|
||||||
(fun (x,y) -> Printf.bprintf b "(%d,%d)" x y)
|
|
||||||
s;
|
|
||||||
Buffer.add_char b '}';
|
|
||||||
Buffer.contents b
|
|
||||||
|
|
||||||
(* list of characters that can specifically be followed from the given set *)
|
|
||||||
let chars_from_set nda set =
|
|
||||||
NDAStateSet.fold
|
|
||||||
(fun state acc ->
|
|
||||||
let transitions = NDA.get nda state in
|
|
||||||
List.fold_left
|
|
||||||
(fun acc tr -> match tr with
|
|
||||||
| NDA.Upon (NDA.Char c, _, _) ->
|
|
||||||
if List.exists (fun c' -> Str.compare_char c c' = 0) acc
|
|
||||||
then acc
|
|
||||||
else c :: acc (* new char! *)
|
|
||||||
| _ -> acc
|
|
||||||
) acc transitions
|
|
||||||
) set []
|
|
||||||
|
|
||||||
(* saturate current set w.r.t epsilon links *)
|
|
||||||
let saturate_epsilon nda set =
|
|
||||||
let q = Queue.create () in
|
|
||||||
NDAStateSet.iter (fun s -> Queue.push s q) set;
|
|
||||||
let set = ref set in
|
|
||||||
while not (Queue.is_empty q) do
|
|
||||||
let state = Queue.pop q in
|
|
||||||
(*Printf.printf "saturate epsilon: add state %d,%d\n" (fst state)(snd state);*)
|
|
||||||
set := NDAStateSet.add state !set;
|
|
||||||
List.iter
|
|
||||||
(fun tr' -> match tr' with
|
|
||||||
| NDA.Epsilon (i,j) ->
|
|
||||||
if not (NDAStateSet.mem (i,j) !set)
|
|
||||||
then Queue.push (i,j) q
|
|
||||||
| _ -> ()
|
|
||||||
) (NDA.get nda state)
|
|
||||||
done;
|
|
||||||
!set
|
|
||||||
|
|
||||||
(* find the transition that matches the given char (if any), or "*";
|
|
||||||
may raise exceptions Not_found or LeadToSuccess. *)
|
|
||||||
let rec get_transition_for_char nda c acc transitions =
|
|
||||||
match transitions with
|
|
||||||
| NDA.Upon (NDA.Char c', i, j) :: transitions' when Str.compare_char c c' = 0 ->
|
|
||||||
(* follow same char *)
|
|
||||||
let acc = NDAStateSet.add (i, j) acc in
|
|
||||||
get_transition_for_char nda c acc transitions'
|
|
||||||
| NDA.Upon (NDA.Any, i, j) :: transitions' ->
|
|
||||||
(* follow '*' *)
|
|
||||||
let acc = NDAStateSet.add (i,j) acc in
|
|
||||||
get_transition_for_char nda c acc transitions'
|
|
||||||
| _ :: transitions' -> get_transition_for_char nda c acc transitions'
|
|
||||||
| [] -> acc
|
|
||||||
|
|
||||||
let rec get_transitions_for_any nda acc transitions =
|
|
||||||
match transitions with
|
|
||||||
| NDA.Upon (NDA.Char _, _, _) :: transitions' ->
|
|
||||||
get_transitions_for_any nda acc transitions'
|
|
||||||
| NDA.Upon (NDA.Any, i, j) :: transitions' ->
|
|
||||||
let acc = NDAStateSet.add (i,j) acc in
|
|
||||||
get_transitions_for_any nda acc transitions'
|
|
||||||
| _:: transitions' -> get_transitions_for_any nda acc transitions'
|
|
||||||
| [] -> acc
|
|
||||||
|
|
||||||
(* follow transition for given NDA.char, returns a new state
|
|
||||||
and a boolean indicating whether it's final *)
|
|
||||||
let follow_transition nda set c =
|
|
||||||
let set' = NDAStateSet.fold
|
|
||||||
(fun state acc ->
|
|
||||||
let transitions = NDA.get nda state in
|
|
||||||
(* among possible transitions, follow the one that matches c
|
|
||||||
the most closely *)
|
|
||||||
get_transition_for_char nda c acc transitions
|
|
||||||
) set NDAStateSet.empty
|
|
||||||
in
|
|
||||||
saturate_epsilon nda set'
|
|
||||||
|
|
||||||
let follow_transition_any nda set =
|
|
||||||
let set' = NDAStateSet.fold
|
|
||||||
(fun state acc ->
|
|
||||||
let transitions = NDA.get nda state in
|
|
||||||
(* among possible transitions, follow the ones that are labelled with "*" *)
|
|
||||||
get_transitions_for_any nda acc transitions
|
|
||||||
) set NDAStateSet.empty
|
|
||||||
in
|
|
||||||
saturate_epsilon nda set'
|
|
||||||
|
|
||||||
(* call [k] with every [transition'] that can be reached from [set], with
|
|
||||||
a bool that states whether it's final *)
|
|
||||||
let iterate_transition_set nda set k =
|
|
||||||
(*Printf.printf "iterate_transition at set %s\n" (set_to_string set);*)
|
|
||||||
(* all possible "fixed char" transitions *)
|
|
||||||
let chars = chars_from_set nda set in
|
|
||||||
List.iter
|
|
||||||
(fun c ->
|
|
||||||
(*Printf.printf "iterate_transition follows %c (at %s)\n"
|
|
||||||
(Obj.magic c) (set_to_string set);*)
|
|
||||||
let set' = follow_transition nda set c in
|
|
||||||
if not (NDAStateSet.is_empty set')
|
|
||||||
then k (NDA.Char c) set';
|
|
||||||
) chars;
|
|
||||||
(* remaining transitions, with only "Any" *)
|
|
||||||
(*Printf.printf "iterate transition follows * (at %s)\n" (set_to_string set);*)
|
|
||||||
let set' = follow_transition_any nda set in
|
|
||||||
if not (NDAStateSet.is_empty set')
|
|
||||||
then k NDA.Any set'
|
|
||||||
|
|
||||||
module StateSetMap = Map.Make(NDAStateSet)
|
|
||||||
|
|
||||||
(* get the state that corresponds to the given set of NDA states.
|
|
||||||
[states] is a map [nda set] -> [nfa state] *)
|
|
||||||
let get_state dfa states set =
|
|
||||||
try StateSetMap.find set !states
|
|
||||||
with Not_found ->
|
|
||||||
let i = add_state dfa in
|
|
||||||
states := StateSetMap.add set i !states;
|
|
||||||
i
|
|
||||||
|
|
||||||
(* traverse the NDA. Currently we're at [set] *)
|
|
||||||
let rec traverse nda dfa states set =
|
|
||||||
let set_i = get_state dfa states set in
|
|
||||||
(* does this set lead to success? *)
|
|
||||||
let is_final = NDAStateSet.exists (NDA.is_final nda) set in
|
|
||||||
if is_final
|
|
||||||
then set_final dfa set_i;
|
|
||||||
iterate_transition_set nda set
|
|
||||||
(fun c set' ->
|
|
||||||
(*Printf.printf "traverse %s --%c--> %s\n" (set_to_string set)
|
|
||||||
(match c with NDA.Char c' -> Obj.magic c' | NDA.Any -> '*')
|
|
||||||
(set_to_string set');*)
|
|
||||||
let set_i' = get_state dfa states set' in
|
|
||||||
(* link set -> set' *)
|
|
||||||
match c with
|
|
||||||
| NDA.Char c' ->
|
|
||||||
add_transition dfa set_i (c', set_i');
|
|
||||||
traverse nda dfa states set'
|
|
||||||
| NDA.Any ->
|
|
||||||
add_otherwise dfa set_i set_i';
|
|
||||||
traverse nda dfa states set'
|
|
||||||
)
|
|
||||||
|
|
||||||
let of_nda nda =
|
|
||||||
let dfa = create (NDA.length nda) in
|
|
||||||
(* map (set of NDA states) to int (state in DFA) *)
|
|
||||||
let states = ref StateSetMap.empty in
|
|
||||||
(* traverse the NDA to build the NFA *)
|
|
||||||
let set = NDAStateSet.singleton (0,0) in
|
|
||||||
let set = saturate_epsilon nda set in
|
|
||||||
traverse nda dfa states set;
|
|
||||||
(*StateSetMap.iter
|
|
||||||
(fun set i ->
|
|
||||||
Printf.printf "set %s --> state %d\n" (set_to_string set) i
|
|
||||||
) !states;*)
|
|
||||||
dfa
|
|
||||||
|
|
||||||
let get dfa i =
|
|
||||||
dfa.transitions.(i)
|
|
||||||
|
|
||||||
let otherwise dfa i =
|
|
||||||
dfa.otherwise.(i)
|
|
||||||
|
|
||||||
let is_final dfa i =
|
|
||||||
dfa.is_final.(i)
|
|
||||||
end
|
|
||||||
|
|
||||||
let debug_print pp_char oc dfa =
|
|
||||||
Printf.fprintf oc "automaton of %d states\n" dfa.DFA.len;
|
|
||||||
for i = 0 to dfa.DFA.len-1 do
|
|
||||||
let transitions = DFA.get dfa i in
|
|
||||||
if DFA.is_final dfa i
|
|
||||||
then Printf.fprintf oc " success %d\n" i;
|
|
||||||
List.iter
|
|
||||||
(fun (c, j) -> Printf.fprintf oc " %d --%a--> %d\n" i pp_char c j ) transitions;
|
|
||||||
let o = DFA.otherwise dfa i in
|
|
||||||
if o >= 0
|
|
||||||
then Printf.fprintf oc " %d --*--> %d\n" i o
|
|
||||||
done
|
|
||||||
|
|
||||||
type automaton = DFA.t
|
|
||||||
|
|
||||||
let of_string ~limit s =
|
|
||||||
let nda = NDA.make ~limit s in
|
|
||||||
let dfa = DFA.of_nda nda in
|
|
||||||
dfa
|
|
||||||
|
|
||||||
let of_list ~limit l =
|
|
||||||
of_string ~limit (Str.of_list l)
|
|
||||||
|
|
||||||
let rec __find_char c l = match l with
|
|
||||||
| [] -> raise Not_found
|
|
||||||
| (c', next) :: l' ->
|
|
||||||
if compare c c' = 0
|
|
||||||
then next
|
|
||||||
else __find_char c l'
|
|
||||||
|
|
||||||
(* transition for [c] in state [i] of [dfa];
|
|
||||||
@raise Not_found if no transition matches *)
|
|
||||||
let __transition dfa i c =
|
|
||||||
let transitions = DFA.get dfa i in
|
|
||||||
try
|
|
||||||
__find_char c transitions
|
|
||||||
with Not_found ->
|
|
||||||
let o = DFA.otherwise dfa i in
|
|
||||||
if o >= 0
|
|
||||||
then o
|
|
||||||
else raise Not_found
|
|
||||||
|
|
||||||
let match_with dfa a =
|
|
||||||
let len = Str.length a in
|
|
||||||
let rec search i state =
|
|
||||||
(*Printf.printf "at state %d (dist %d)\n" i dist;*)
|
|
||||||
if i = len
|
|
||||||
then DFA.is_final dfa state
|
|
||||||
else begin
|
|
||||||
(* current char *)
|
|
||||||
let c = Str.get a i in
|
|
||||||
try
|
|
||||||
let next = __transition dfa state c in
|
|
||||||
search (i+1) next
|
|
||||||
with Not_found -> false
|
|
||||||
end
|
|
||||||
in
|
|
||||||
search 0 0
|
|
||||||
|
|
||||||
(** {6 Index for one-to-many matching} *)
|
|
||||||
|
|
||||||
module Index = struct
|
|
||||||
type key = char_
|
|
||||||
|
|
||||||
module M = Map.Make(struct
|
|
||||||
type t = key
|
|
||||||
let compare = Str.compare_char
|
|
||||||
end)
|
|
||||||
|
|
||||||
type 'b t =
|
|
||||||
| Node of 'b option * 'b t M.t
|
|
||||||
|
|
||||||
let empty = Node (None, M.empty)
|
|
||||||
|
|
||||||
let is_empty = function
|
|
||||||
| Node (None, m) -> M.is_empty m
|
|
||||||
| _ -> false
|
|
||||||
|
|
||||||
let () = assert (is_empty empty)
|
|
||||||
|
|
||||||
(** get/add/remove the leaf for the given array.
|
|
||||||
the continuation k takes the leaf, and returns a leaf option
|
|
||||||
that replaces the old leaf.
|
|
||||||
This function returns the new trie. *)
|
|
||||||
let goto_leaf s node k =
|
|
||||||
let len = Str.length s in
|
|
||||||
(* insert the value in given [node], assuming the current index
|
|
||||||
in [arr] is [i]. [k] is given the resulting tree. *)
|
|
||||||
let rec goto node i rebuild = match node with
|
|
||||||
| _ when i = len ->
|
|
||||||
let node' = k node in
|
|
||||||
rebuild node'
|
|
||||||
| Node (opt, m) ->
|
|
||||||
let c = Str.get s i in
|
|
||||||
let t' =
|
|
||||||
try M.find c m
|
|
||||||
with Not_found -> empty
|
|
||||||
in
|
|
||||||
goto t' (i+1)
|
|
||||||
(fun t'' ->
|
|
||||||
if is_empty t''
|
|
||||||
then rebuild (Node (opt, M.remove c m))
|
|
||||||
else rebuild (Node (opt, M.add c t'' m)))
|
|
||||||
in
|
|
||||||
goto node 0 (fun t -> t)
|
|
||||||
|
|
||||||
let add trie s value =
|
|
||||||
goto_leaf s trie
|
|
||||||
(function
|
|
||||||
| Node (_, m) -> Node (Some value, m))
|
|
||||||
|
|
||||||
let remove trie s =
|
|
||||||
goto_leaf s trie
|
|
||||||
(function
|
|
||||||
| Node (_, m) -> Node (None, m))
|
|
||||||
|
|
||||||
(* traverse the automaton and the idx, yielding a klist of values *)
|
|
||||||
let retrieve ~limit idx s =
|
|
||||||
let dfa = of_string ~limit s in
|
|
||||||
(* traverse at index i in automaton, with
|
|
||||||
[fk] the failure continuation *)
|
|
||||||
let rec traverse node i ~(fk:'a klist) () =
|
|
||||||
match node with
|
|
||||||
| Node (opt, m) ->
|
|
||||||
(* all alternatives: continue exploring [m], or call [fk] *)
|
|
||||||
let fk =
|
|
||||||
M.fold
|
|
||||||
(fun c node' fk ->
|
|
||||||
try
|
|
||||||
let next = __transition dfa i c in
|
|
||||||
traverse node' next ~fk
|
|
||||||
with Not_found -> fk)
|
|
||||||
m fk
|
|
||||||
in
|
|
||||||
match opt with
|
|
||||||
| Some v when DFA.is_final dfa i ->
|
|
||||||
(* yield one solution now *)
|
|
||||||
`Cons (v, fk)
|
|
||||||
| _ -> fk () (* fail... or explore subtrees *)
|
|
||||||
in
|
|
||||||
traverse idx 0 ~fk:(fun () -> `Nil)
|
|
||||||
|
|
||||||
let of_list l =
|
|
||||||
List.fold_left
|
|
||||||
(fun acc (arr,v) -> add acc arr v)
|
|
||||||
empty l
|
|
||||||
|
|
||||||
let fold f acc idx =
|
|
||||||
let rec explore acc trail node = match node with
|
|
||||||
| Node (opt, m) ->
|
|
||||||
(* first, yield current value, if any *)
|
|
||||||
let acc = match opt with
|
|
||||||
| None -> acc
|
|
||||||
| Some v ->
|
|
||||||
let str = Str.of_list (List.rev trail) in
|
|
||||||
f acc str v
|
|
||||||
in
|
|
||||||
M.fold
|
|
||||||
(fun c node' acc -> explore acc (c::trail) node')
|
|
||||||
m acc
|
|
||||||
in
|
|
||||||
explore acc [] idx
|
|
||||||
|
|
||||||
let iter f idx =
|
|
||||||
fold (fun () str v -> f str v) () idx
|
|
||||||
|
|
||||||
let cardinal idx = fold (fun n _ _ -> n+1) 0 idx
|
|
||||||
|
|
||||||
let to_list idx =
|
|
||||||
fold (fun acc str v -> (str,v) :: acc) [] idx
|
|
||||||
|
|
||||||
let add_seq i s =
|
|
||||||
let i = ref i in
|
|
||||||
s (fun (arr,v) -> i := add !i arr v);
|
|
||||||
!i
|
|
||||||
|
|
||||||
let of_seq s = add_seq empty s
|
|
||||||
|
|
||||||
let to_seq i yield = iter (fun x y -> yield (x,y)) i
|
|
||||||
|
|
||||||
(*$Q
|
|
||||||
list_uniq_ (fun l -> \
|
|
||||||
Sequence.of_list l |> Index.of_seq |> Index.to_seq \
|
|
||||||
|> Sequence.to_list |> List.sort Pervasives.compare \
|
|
||||||
= List.sort Pervasives.compare l)
|
|
||||||
*)
|
|
||||||
|
|
||||||
let rec add_gen i g = match g() with
|
|
||||||
| None -> i
|
|
||||||
| Some (arr,v) -> add_gen (add i arr v) g
|
|
||||||
|
|
||||||
let of_gen g = add_gen empty g
|
|
||||||
|
|
||||||
let to_gen s =
|
|
||||||
let st = Stack.create () in
|
|
||||||
Stack.push ([],s) st;
|
|
||||||
let rec next () =
|
|
||||||
if Stack.is_empty st then None
|
|
||||||
else
|
|
||||||
let trail, Node (opt, m) = Stack.pop st in
|
|
||||||
(* explore children *)
|
|
||||||
M.iter
|
|
||||||
(fun c node' -> Stack.push (c::trail, node') st)
|
|
||||||
m;
|
|
||||||
match opt with
|
|
||||||
| None -> next()
|
|
||||||
| Some v ->
|
|
||||||
let str = Str.of_list (List.rev trail) in
|
|
||||||
Some (str,v)
|
|
||||||
in
|
|
||||||
next
|
|
||||||
|
|
||||||
(*$Q
|
|
||||||
list_uniq_ (fun l -> \
|
|
||||||
Gen.of_list l |> Index.of_gen |> Index.to_gen \
|
|
||||||
|> Gen.to_list |> List.sort Pervasives.compare \
|
|
||||||
= List.sort Pervasives.compare l)
|
|
||||||
*)
|
|
||||||
|
|
||||||
let to_klist idx =
|
|
||||||
let rec traverse node trail ~(fk:(string_*'a) klist) () =
|
|
||||||
let Node (opt, m) = node in
|
|
||||||
(* all alternatives: continue exploring [m], or call [fk] *)
|
|
||||||
let fk =
|
|
||||||
M.fold
|
|
||||||
(fun c node' fk -> traverse node' (c::trail) ~fk)
|
|
||||||
m fk
|
|
||||||
in
|
|
||||||
match opt with
|
|
||||||
| Some v ->
|
|
||||||
let str = Str.of_list (List.rev trail) in
|
|
||||||
`Cons ((str,v), fk)
|
|
||||||
| _ -> fk () (* fail... or explore subtrees *)
|
|
||||||
in
|
|
||||||
traverse idx [] ~fk:(fun () -> `Nil)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
include Make(struct
|
|
||||||
type t = string
|
|
||||||
type char_ = char
|
|
||||||
let compare_char = Char.compare
|
|
||||||
let length = String.length
|
|
||||||
let get = String.get
|
|
||||||
let of_list l =
|
|
||||||
let buf = Buffer.create (List.length l) in
|
|
||||||
List.iter (fun c -> Buffer.add_char buf c) l;
|
|
||||||
Buffer.contents buf
|
|
||||||
end)
|
|
||||||
|
|
||||||
let debug_print = debug_print output_char
|
|
||||||
|
|
||||||
(*$T
|
|
||||||
edit_distance "foo" "fo0" = 1
|
|
||||||
edit_distance "foob" "foo" = 1
|
|
||||||
edit_distance "yolo" "yoyo" = 1
|
|
||||||
edit_distance "aaaaaaab" "aaaa" = 4
|
|
||||||
*)
|
|
||||||
|
|
||||||
(*
|
|
||||||
open Batteries;;
|
|
||||||
let words = File.with_file_in "/usr/share/dict/cracklib-small" (fun i -> IO.read_all i |> String.nsplit ~by:"\\n");;
|
|
||||||
let idx = List.fold_left (fun idx s -> Levenshtein.StrIndex.add_string idx s s) Levenshtein.StrIndex.empty words;;
|
|
||||||
Levenshtein.StrIndex.retrieve_string ~limit:1 idx "hell" |> Levenshtein.klist_to_list;;
|
|
||||||
*)
|
|
||||||
|
|
@ -1,200 +0,0 @@
|
||||||
(*
|
|
||||||
copyright (c) 2013, simon cruanes
|
|
||||||
all rights reserved.
|
|
||||||
|
|
||||||
redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are met:
|
|
||||||
|
|
||||||
redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer. redistributions in binary
|
|
||||||
form must reproduce the above copyright notice, this list of conditions and the
|
|
||||||
following disclaimer in the documentation and/or other materials provided with
|
|
||||||
the distribution.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
||||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*)
|
|
||||||
|
|
||||||
|
|
||||||
(** {1 Levenshtein distance}
|
|
||||||
|
|
||||||
We take inspiration from
|
|
||||||
http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
|
|
||||||
for the main algorithm and ideas. However some parts are adapted *)
|
|
||||||
|
|
||||||
type 'a sequence = ('a -> unit) -> unit
|
|
||||||
type 'a gen = unit -> 'a option
|
|
||||||
|
|
||||||
(** {2 Abstraction over Strings}
|
|
||||||
Due to the existence of several encodings and string representations we
|
|
||||||
abstract over the type of strings. A string is a finite array of characters
|
|
||||||
(8-bits char, unicode runes, etc.) which provides a length operation
|
|
||||||
and a function to access the n-th character. *)
|
|
||||||
|
|
||||||
module type STRING = sig
|
|
||||||
type char_
|
|
||||||
type t
|
|
||||||
|
|
||||||
val of_list : char_ list -> t
|
|
||||||
val get : t -> int -> char_
|
|
||||||
val length : t -> int
|
|
||||||
val compare_char : char_ -> char_ -> int
|
|
||||||
end
|
|
||||||
|
|
||||||
(** {2 Continuation list}
|
|
||||||
|
|
||||||
This data structure is used to represent a list of result that is
|
|
||||||
evaluated only as far as the user wants. If the user only wants a few elements,
|
|
||||||
she doesn't pay for the remaining ones.
|
|
||||||
|
|
||||||
In particular, when matching a string against a (big) set of indexed
|
|
||||||
strings, we return a continuation list so that, even if there are many results,
|
|
||||||
only those actually asked for are evaluated. *)
|
|
||||||
|
|
||||||
type 'a klist =
|
|
||||||
unit -> [
|
|
||||||
| `Nil
|
|
||||||
| `Cons of 'a * 'a klist
|
|
||||||
]
|
|
||||||
|
|
||||||
val klist_to_list : 'a klist -> 'a list
|
|
||||||
(** Helper for short lists. *)
|
|
||||||
|
|
||||||
(** {2 Signature}
|
|
||||||
|
|
||||||
The signature for a given string representation provides 3 main things:
|
|
||||||
|
|
||||||
- a [edit_distance] function to compute the edit distance between strings
|
|
||||||
- an [automaton] type that is built from a string [s] and a maximum distance [n],
|
|
||||||
and only accepts the strings [s'] such that [edit_distance s s' <= n].
|
|
||||||
- an [Index] module that can be used to map many strings to values, like
|
|
||||||
a regular string map, but for which retrieval is fuzzy (for a given
|
|
||||||
maximal distance).
|
|
||||||
|
|
||||||
A possible use of the index could be:
|
|
||||||
{[
|
|
||||||
|
|
||||||
let words = CCIO.with_in "/usr/share/dict/words"
|
|
||||||
(fun i -> CCIO.read_all i |> CCString.Split.list_cpy ~by:"\n");;
|
|
||||||
|
|
||||||
let words = List.map (fun s->s,s) words;;
|
|
||||||
let idx = CCLevenshtein.Index.of_list words;;
|
|
||||||
|
|
||||||
CCLevenshtein.Index.retrieve ~limit:1 idx "hell" |> CCLevenshtein.klist_to_list;;
|
|
||||||
]}
|
|
||||||
*)
|
|
||||||
|
|
||||||
module type S = sig
|
|
||||||
type char_
|
|
||||||
type string_
|
|
||||||
|
|
||||||
(** {6 Edit Distance} *)
|
|
||||||
|
|
||||||
val edit_distance : string_ -> string_ -> int
|
|
||||||
(** Edition distance between two strings. This satisfies the classical
|
|
||||||
distance axioms: it is always positive, symmetric, and satisfies
|
|
||||||
the formula [distance a b + distance b c >= distance a c] *)
|
|
||||||
|
|
||||||
(** {6 Automaton}
|
|
||||||
An automaton, built from a string [s] and a limit [n], that accepts
|
|
||||||
every string that is at distance at most [n] from [s]. *)
|
|
||||||
|
|
||||||
type automaton
|
|
||||||
(** Levenshtein automaton *)
|
|
||||||
|
|
||||||
val of_string : limit:int -> string_ -> automaton
|
|
||||||
(** Build an automaton from a string, with a maximal distance [limit].
|
|
||||||
The automaton will accept strings whose {!edit_distance} to the
|
|
||||||
parameter is at most [limit]. *)
|
|
||||||
|
|
||||||
val of_list : limit:int -> char_ list -> automaton
|
|
||||||
(** Build an automaton from a list, with a maximal distance [limit] *)
|
|
||||||
|
|
||||||
val debug_print : (out_channel -> char_ -> unit) ->
|
|
||||||
out_channel -> automaton -> unit
|
|
||||||
(** Output the automaton's structure on the given channel. *)
|
|
||||||
|
|
||||||
val match_with : automaton -> string_ -> bool
|
|
||||||
(** [match_with a s] matches the string [s] against [a], and returns
|
|
||||||
[true] if the distance from [s] to the word represented by [a] is smaller
|
|
||||||
than the limit used to build [a] *)
|
|
||||||
|
|
||||||
(** {6 Index for one-to-many matching} *)
|
|
||||||
|
|
||||||
module Index : sig
|
|
||||||
type 'b t
|
|
||||||
(** Index that maps strings to values of type 'b. Internally it is
|
|
||||||
based on a trie. A string can only map to one value. *)
|
|
||||||
|
|
||||||
val empty : 'b t
|
|
||||||
(** Empty index *)
|
|
||||||
|
|
||||||
val is_empty : _ t -> bool
|
|
||||||
|
|
||||||
val add : 'b t -> string_ -> 'b -> 'b t
|
|
||||||
(** Add a pair string/value to the index. If a value was already present
|
|
||||||
for this string it is replaced. *)
|
|
||||||
|
|
||||||
val cardinal : _ t -> int
|
|
||||||
(** Number of bindings *)
|
|
||||||
|
|
||||||
val remove : 'b t -> string_ -> 'b t
|
|
||||||
(** Remove a string (and its associated value, if any) from the index. *)
|
|
||||||
|
|
||||||
val retrieve : limit:int -> 'b t -> string_ -> 'b klist
|
|
||||||
(** Lazy list of objects associated to strings close to the query string *)
|
|
||||||
|
|
||||||
val of_list : (string_ * 'b) list -> 'b t
|
|
||||||
(** Build an index from a list of pairs of strings and values *)
|
|
||||||
|
|
||||||
val to_list : 'b t -> (string_ * 'b) list
|
|
||||||
(** Extract a list of pairs from an index *)
|
|
||||||
|
|
||||||
val add_seq : 'a t -> (string_ * 'a) sequence -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val of_seq : (string_ * 'a) sequence -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val to_seq : 'a t -> (string_ * 'a) sequence
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val add_gen : 'a t -> (string_ * 'a) gen -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val of_gen : (string_ * 'a) gen -> 'a t
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val to_gen : 'a t -> (string_ * 'a) gen
|
|
||||||
(** @since 0.14 *)
|
|
||||||
|
|
||||||
val fold : ('a -> string_ -> 'b -> 'a) -> 'a -> 'b t -> 'a
|
|
||||||
(** Fold over the stored pairs string/value *)
|
|
||||||
|
|
||||||
val iter : (string_ -> 'b -> unit) -> 'b t -> unit
|
|
||||||
(** Iterate on the pairs *)
|
|
||||||
|
|
||||||
val to_klist : 'b t -> (string_ * 'b) klist
|
|
||||||
(** Conversion to an iterator *)
|
|
||||||
end
|
|
||||||
end
|
|
||||||
|
|
||||||
(** {2 Functor} *)
|
|
||||||
|
|
||||||
module Make(Str : STRING) : S
|
|
||||||
with type string_ = Str.t
|
|
||||||
and type char_ = Str.char_
|
|
||||||
|
|
||||||
(** {2 Default instance: string} *)
|
|
||||||
|
|
||||||
include S with type char_ = char and type string_ = string
|
|
||||||
|
|
||||||
val debug_print : out_channel -> automaton -> unit
|
|
||||||
|
|
@ -1,29 +0,0 @@
|
||||||
(*
|
|
||||||
copyright (c) 2013-2015, simon cruanes
|
|
||||||
all rights reserved.
|
|
||||||
|
|
||||||
redistribution and use in source and binary forms, with or without
|
|
||||||
modification, are permitted provided that the following conditions are met:
|
|
||||||
|
|
||||||
redistributions of source code must retain the above copyright notice, this
|
|
||||||
list of conditions and the following disclaimer. redistributions in binary
|
|
||||||
form must reproduce the above copyright notice, this list of conditions and the
|
|
||||||
following disclaimer in the documentation and/or other materials provided with
|
|
||||||
the distribution.
|
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
||||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
||||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
||||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*)
|
|
||||||
|
|
||||||
module App_parse = CCApp_parse
|
|
||||||
module Parse = CCParse
|
|
||||||
module KMP = CCKMP
|
|
||||||
module Levenshtein = CCLevenshtein
|
|
||||||
Loading…
Add table
Reference in a new issue