wip: parser combinators, trying to fix fix

This commit is contained in:
Simon Cruanes 2015-03-05 14:06:30 +01:00
parent ee8c7c03f2
commit 5c7cb55378
3 changed files with 208 additions and 118 deletions

1
_oasis
View file

@ -86,6 +86,7 @@ Library "containers_string"
Path: src/string Path: src/string
Pack: true Pack: true
Modules: KMP, Levenshtein, App_parse Modules: KMP, Levenshtein, App_parse
BuildDepends: bytes
FindlibName: string FindlibName: string
FindlibParent: containers FindlibParent: containers

View file

@ -51,10 +51,26 @@ let print_char_map map =
(fun c _ acc -> str "'%c'" c :: acc) map [] in (fun c _ acc -> str "'%c'" c :: acc) map [] in
String.concat ", " l String.concat ", " l
let set_of_string s =
let set = ref CharSet.empty in
String.iter
(fun c ->
if CharSet.mem c !set
then invalid_arg (str "any_of: duplicate char %c" c);
set := CharSet.add c !set
) s;
!set
(* add [c -> p] to the map, for every [c] in [set] *)
let map_add_set init set p =
CharSet.fold
(fun c map -> CharMap.add c p map)
set init
(* function composition *) (* function composition *)
let compose f g x = f (g x) let compose f g x = f (g x)
let string_of_list l = let str_of_l l =
let b = Bytes.make (List.length l) ' ' in let b = Bytes.make (List.length l) ' ' in
List.iteri (fun i c -> Bytes.set b i c) l; List.iteri (fun i c -> Bytes.set b i c) l;
Bytes.unsafe_to_string b Bytes.unsafe_to_string b
@ -73,36 +89,42 @@ type _ t =
| Many : CharSet.t * 'a t * unit t * multiplicity -> 'a list t | Many : CharSet.t * 'a t * unit t * multiplicity -> 'a list t
| Skip : CharSet.t * 'a t * multiplicity -> unit t (* same as Many, but ignores *) | Skip : CharSet.t * 'a t * multiplicity -> unit t (* same as Many, but ignores *)
| SwitchC : 'a t CharMap.t * 'a t option -> 'a t | SwitchC : 'a t CharMap.t * 'a t option -> 'a t
| SwitchS :'a trie -> 'a t | Lazy : 'a t Lazy.t -> 'a t
| Fix : ('a t -> 'a t) -> 'a t
| Eof : unit t | Eof : unit t
(* a prefix trie *)
and 'a trie =
| TrieEmpty
| TrieNode of 'a t option * 'a trie CharMap.t
let return x = Return x let return x = Return x
let pure = return
let success = Return () let success = Return ()
let fail msg = Fail msg let fail msg = Fail msg
let app f x = App (f, x) let failf fmt = Printf.ksprintf (fun msg -> fail msg) fmt
let map f x = match x with let map f x = match x with
| Map (g, y) -> Map (compose f g, y) | Map (g, y) -> Map (compose f g, y)
| Return x -> Return (f x) | Return x -> Return (f x)
| _ -> Map (f,x) | _ -> Map (f,x)
let filter f x = Filter (f, x) let app f x = match f with
| Return f -> map f x
| _ -> App (f, x)
let filter f x = match x with
| Return y -> if f y then Return y else fail "filter failed"
| Filter (f', y) -> Filter ((fun x -> f' x && f x), y)
| _ -> Filter (f, x)
let app_left a b = AppLeft (a, b) (* return (fun x y -> x) <*> a <*> b *)
let app_right a b = AppRight (a, b) (* return (fun x y -> y) <*> a <*> b *)
let int = Int let int = Int
let float = Float let float = Float
let int_first_char = let int_first_char = lazy (set_of_string "-0123456789")
lazy (CharSet.of_list ['-'; '0'; '1'; '2'; '3'; '4'; '5'; '6'; '7'; '8'; '9']) let float_first_char = lazy (set_of_string ".-0123456789")
(* a set of characters that are valid as first characters of a parser *) (* a set of characters that are valid as first characters of a parser *)
type possible_first_chars = type possible_first_chars =
@ -111,6 +133,15 @@ type possible_first_chars =
| NoChar | NoChar
| IsFail of string | IsFail of string
let ret_set set =
if CharSet.cardinal set = 256 then AllChars else Set set
let union_possible_first_chars a b = match a, b with
| Set a, Set b -> Set (CharSet.union a b)
| IsFail e, _ | _, IsFail e -> IsFail e
| AllChars, _ | _, AllChars -> AllChars
| NoChar, o | o, NoChar -> o
(* set of possibilities for the first char of a parser *) (* set of possibilities for the first char of a parser *)
let rec possible_first_chars let rec possible_first_chars
: type a. a t -> possible_first_chars : type a. a t -> possible_first_chars
@ -123,69 +154,46 @@ let rec possible_first_chars
| AppRight (a, _) -> possible_first_chars a | AppRight (a, _) -> possible_first_chars a
| Fail e -> IsFail e | Fail e -> IsFail e
| Int -> Set (Lazy.force int_first_char) | Int -> Set (Lazy.force int_first_char)
| Float -> Set (Lazy.force int_first_char) | Float -> Set (Lazy.force float_first_char)
| AnyOf set -> Set set | AnyOf set -> ret_set set
| Many(set, _, _, _) -> Set set | Many(set, p, _, (Question | Star)) ->
| Skip (set, _, _) -> Set set union_possible_first_chars (ret_set set) (possible_first_chars p)
| SwitchC (map, None) -> Set (domain_of_char_map map) | Skip (set, p, (Question | Star)) ->
union_possible_first_chars (ret_set set) (possible_first_chars p)
| Many (set, _, _, Plus) -> Set set
| Skip (set, _, Plus) -> ret_set set
| SwitchC (map, None) -> ret_set (domain_of_char_map map)
| SwitchC (_, Some _) -> AllChars | SwitchC (_, Some _) -> AllChars
| SwitchS TrieEmpty -> assert false | Lazy (lazy p) -> possible_first_chars p
| SwitchS (TrieNode (_, m)) -> Set (domain_of_char_map m)
| Fix f ->
let p = f (Fix f) in
possible_first_chars p
| Eof -> NoChar | Eof -> NoChar
let many ?(sep=success) t = let many_ ~sep ~mult ~p = match possible_first_chars p with
match possible_first_chars t with | Set set -> Many (set, p, sep, mult)
| Set set -> Many (set, t, sep, Star)
| IsFail msg -> Fail msg | IsFail msg -> Fail msg
| AllChars -> invalid_arg (str "many: invalid parser (always succeeds)") | AllChars -> invalid_arg (str "many: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "many: invalid parser (does not consume input)") | NoChar -> invalid_arg (str "many: invalid parser (does not consume input)")
let many1 ?(sep=success) t = let many ?(sep=success) p = many_ ~sep ~mult:Star ~p
match possible_first_chars t with
| Set set -> Many (set, t, sep, Plus)
| IsFail msg -> Fail msg
| AllChars -> invalid_arg (str "many: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "many: invalid parser (does not consume input)")
let skip t = let many1 ?(sep=success) p = many_ ~sep ~mult:Plus ~p
match possible_first_chars t with
| Set set -> Skip (set, t, Star) let skip_ ~mult ~p = match possible_first_chars p with
| Set set -> Skip (set, p, mult)
| IsFail msg -> Fail msg | IsFail msg -> Fail msg
| AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)") | AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)") | NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)")
let skip1 t = let skip p = skip_ ~mult:Star ~p
match possible_first_chars t with
| Set set -> Skip (set, t, Plus)
| IsFail msg -> Fail msg
| AllChars -> invalid_arg (str "skip: invalid parser (always succeeds)")
| NoChar -> invalid_arg (str "skip: invalid parser (does not consume input)")
let opt t = let skip1 p = skip_ ~mult:Plus ~p
match possible_first_chars t with
| Set set -> let opt p =
map map
(function (function
| [x] -> Some x | [x] -> Some x
| [] -> None | [] -> None
| _ -> assert false | _ -> assert false
) (Many (set, t, success, Question)) ) (many_ ~sep:success ~mult:Question ~p)
| IsFail msg -> Fail msg
| AllChars -> map (fun x -> Some x) t (* always succeeds *)
| NoChar -> invalid_arg (str "opt: invalid parser (does not consume input)")
let set_of_string s =
let set = ref CharSet.empty in
String.iter
(fun c ->
if CharSet.mem c !set
then invalid_arg (str "any_of: duplicate char %c" c);
set := CharSet.add c !set
) s;
!set
let any_of s = AnyOf (set_of_string s) let any_of s = AnyOf (set_of_string s)
@ -226,34 +234,86 @@ let switch_c ?default l =
exception ExnIsFail of string exception ExnIsFail of string
let choice l = (* binary choice *)
if l = [] then invalid_arg "choice: empty list"; let rec merge a b =
(* build a switch by first char *) (* build a switch by first char *)
try try
(* a map and possibly a default parser *) begin match a, b with
let map, def = List.fold_left | SwitchC (map_a, def_a), SwitchC (map_b, def_b) ->
(fun (map, def) p -> (* merge jump tables *)
match possible_first_chars p, def with let def = match def_a, def_b with
| AllChars, Some _ -> | None, None -> None
invalid_arg "choice: ambiguous, several parsers accept any input" | Some d, None
| AllChars, None -> map, Some p | None, Some d -> Some d
| NoChar, _ -> map, def | Some _, Some _ ->
| IsFail msg, _ -> raise (ExnIsFail msg) invalid_arg "choice: ambiguous, several parsers accept any input"
| Set set, def -> in
if CharSet.exists (fun c -> CharMap.mem c map) set let map = CharMap.merge
then invalid_arg (fun _ a b -> match a, b with
(str "choice: ambiguous parsers (overlap on {%s})" | Some a', Some b' -> Some (merge a' b')
(print_char_set (CharSet.inter set (domain_of_char_map map)))); | Some m, None
let map = CharSet.fold | None, Some m -> Some m
(fun c map -> CharMap.add c p map) | None, None -> assert false
set map ) map_a map_b
in map, def in
) (CharMap.empty, None) l SwitchC (map, def)
in | SwitchC (map, def), other
SwitchC (map, def) | other, SwitchC (map, def) ->
let map', def' = match possible_first_chars other, def with
| AllChars, _ ->
invalid_arg "choice: ambiguous, several parsers accept any input"
| NoChar, None -> map, Some other
| NoChar, Some _ ->
invalid_arg "choice: ambiguous"
| IsFail msg, _ -> raise (ExnIsFail msg)
| Set set, def ->
if CharSet.exists (fun c -> CharMap.mem c map) set
then invalid_arg
(str "choice: ambiguous parsers (overlap on {%s})"
(print_char_set (CharSet.inter set (domain_of_char_map map))));
(* else: merge jump tables *)
let map = map_add_set map set other in
map, def
in
SwitchC (map', def')
| _ ->
begin match possible_first_chars a, possible_first_chars b with
| Set set1, Set set2 ->
if CharSet.exists (fun c -> CharSet.mem c set2) set1
then invalid_arg
(str "choice: ambiguous parsers (overlap on {%s})"
(print_char_set (CharSet.inter set1 set2)));
let map = map_add_set CharMap.empty set1 a in
let map = map_add_set map set2 b in
SwitchC (map, None)
| IsFail e, _ | _, IsFail e -> raise (ExnIsFail e)
| Set s, NoChar -> SwitchC (map_add_set CharMap.empty s a, Some b)
| NoChar, Set s -> SwitchC (map_add_set CharMap.empty s b, Some a)
| AllChars, _ | _, AllChars ->
invalid_arg "choice: ambiguous parsers (one accepts everything)"
| NoChar, NoChar ->
invalid_arg "choice: ambiguous parsers (both accept nothing)"
end
end
with ExnIsFail msg -> with ExnIsFail msg ->
fail msg fail msg
let rec choice l = match l with
| [] -> invalid_arg "choice: empty list";
| [x] -> x
| a :: b :: tail -> choice (merge a b :: tail)
(* temporary structure for buildings switches *)
type 'a trie =
| TrieLeaf of 'a t
| TrieNode of 'a trie CharMap.t
let trie_empty = TrieNode CharMap.empty
let rec parser_of_trie : type a. a trie -> a t = function
| TrieLeaf p -> p
| TrieNode m -> SwitchC (CharMap.map parser_of_trie m, None)
(* build prefix trie *) (* build prefix trie *)
let switch_s l = let switch_s l =
if l = [] then invalid_arg "switch_s: empty list"; if l = [] then invalid_arg "switch_s: empty list";
@ -261,54 +321,64 @@ let switch_s l =
let rec add_trie t s i p = let rec add_trie t s i p =
if i = String.length s if i = String.length s
then match t with then match t with
| TrieEmpty -> TrieNode (Some p, CharMap.empty) | TrieNode m when CharMap.is_empty m -> TrieLeaf p
| TrieNode (Some _, _) -> invalid_arg (str "duplicate key \"%s\"" s) | TrieNode _ -> invalid_arg (str "key \"%s\" is prefix of another key" s)
| TrieNode (None, m) -> TrieNode (Some p, m) | TrieLeaf _ -> invalid_arg (str "duplicate key \"%s\"" s)
else else
let c = String.get s i in let c = String.get s i in
match t with match t with
| TrieEmpty -> | TrieLeaf _ ->
let sub = add_trie TrieEmpty s (i+1) p in invalid_arg (str "key \"%s\" is prefixed by another key" s)
TrieNode (None, CharMap.singleton c sub) | TrieNode map ->
| TrieNode (opt, map) ->
try try
let sub = CharMap.find c map in let sub = CharMap.find c map in
let sub = add_trie sub s (i+1) p in let sub = add_trie sub s (i+1) p in
TrieNode (opt, CharMap.add c sub map) TrieNode (CharMap.add c sub map)
with Not_found -> with Not_found ->
let sub = add_trie TrieEmpty s (i+1) p in let sub = add_trie trie_empty s (i+1) p in
TrieNode (opt, CharMap.add c sub map) TrieNode (CharMap.add c sub map)
in in
let trie = let trie =
List.fold_left List.fold_left
(fun trie (s, p) -> (fun trie (s, p) ->
if s = "" then invalid_arg "switch_s: empty string"; if s = "" then invalid_arg "switch_s: empty string";
add_trie trie s 0 p add_trie trie s 0 p
) TrieEmpty l ) trie_empty l
in in
SwitchS trie parser_of_trie trie
let bool = let bool =
switch_s switch_s
[ "true", Return true [ "true", return true
; "false", Return false ; "false", return false
] ]
let fix f = Fix f let delay p = Lazy p
(* FIXME: does not work in practice. Must separate definition of combinators
from compilation to decision tree *)
let fix f =
let rec r = lazy (f r) in
Lazy.force r
module Infix = struct module Infix = struct
let (>|=) x f = map f x let (>|=) x f = map f x
let (<*>) = app let (<*>) = app
let (<<) a b = AppLeft (a, b) (* return (fun x y -> x) <*> a <*> b *) let (<<) = app_left
let (>>) a b = AppRight (a, b) (* return (fun x y -> y) <*> a <*> b *) let (>>) = app_right
let (<+>) a b = choice [a; b] let (<+>) = merge
let (<::>) a b = pure (fun x l -> x::l) <*> a <*> b
end end
include Infix include Infix
(* TODO: more efficient version, with buffer *) (* TODO: more efficient version, with buffer *)
let word = let word =
return (fun c s -> string_of_list (c :: s)) <*> alpha <*> many alpha_num pure (fun c s -> str_of_l (c :: s)) <*> alpha <*> many alpha_num
(* TODO *)
let quoted =
Lazy (lazy (failwith "quoted: not implemented"))
(** {2 Signatures} *) (** {2 Signatures} *)
@ -495,23 +565,7 @@ module MakeFromReader(R : READER) : S with type source = R.source = struct
| Some d -> parse_rec r d | Some d -> parse_rec r d
end end
end end
| SwitchS TrieEmpty -> assert false | Lazy (lazy p) -> parse_rec r p
| SwitchS (TrieNode (Some p, _)) ->
parse_rec r p
| SwitchS (TrieNode (None, map)) ->
begin match R.next r with
| EOF -> errorf r "expected any of {%s}, got EOF" (print_char_map map)
| Yield c ->
begin try
let trie = CharMap.find c map in
parse_rec r (SwitchS trie) (* recurse in subtree *)
with Not_found ->
errorf r "expected any of {%s}, got %c" (print_char_map map) c
end
end
| Fix f ->
let p = f (Fix f) in
parse_rec r p
| Eof -> | Eof ->
begin match R.next r with begin match R.next r with
| EOF -> () | EOF -> ()

View file

@ -26,6 +26,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(** {1 Applicative Parser Combinators} (** {1 Applicative Parser Combinators}
Example: basic S-expr parser
{[
open Containers_string.App_parse;;
type sexp = Atom of string | List of sexp list;;
let mkatom a = Atom a;;
let mklist l = List l;;
let sexp = fix (fun sexp ->
spaces >>
((word >|= mkatom) <+>
((char '(' >> many (delay sexp) << char ')') >|= mklist)
)
);;
]}
{b status: experimental} {b status: experimental}
@since NEXT_RELEASE @since NEXT_RELEASE
*) *)
@ -40,10 +59,13 @@ type 'a t
val return : 'a -> 'a t val return : 'a -> 'a t
(** Parser that succeeds with the given value *) (** Parser that succeeds with the given value *)
val pure : 'a -> 'a t
(** Synonym to {!return} *)
val fail : string -> 'a t val fail : string -> 'a t
(** [fail msg] fails with the given error message *) (** [fail msg] fails with the given error message *)
(* TODO: a format version of fail *) val failf : ('a, unit, string, 'b t) format4 -> 'a
val app : ('a -> 'b) t -> 'a t -> 'b t val app : ('a -> 'b) t -> 'a t -> 'b t
(** Applicative *) (** Applicative *)
@ -81,6 +103,12 @@ val word : string t
not containing any whitespace nor delimiter not containing any whitespace nor delimiter
TODO: specify *) TODO: specify *)
val quoted : string t
(** Quoted string, following OCaml conventions *)
val str_of_l : char list -> string
(** Helper to build strings from lists of chars *)
val spaces : unit t val spaces : unit t
(** Parse a sequence of ['\t'] and [' '] *) (** Parse a sequence of ['\t'] and [' '] *)
@ -132,11 +160,15 @@ val choice : 'a t list -> 'a t
@raise Invalid_argument if the list is empty, or if some parsers @raise Invalid_argument if the list is empty, or if some parsers
overlap, making the choice ambiguous *) overlap, making the choice ambiguous *)
val fix : ('a t -> 'a t) -> 'a t val delay : 'a t Lazy.t -> 'a t
(** delay evaluation. Useful in combination with {!fix} *)
val fix : ('a t Lazy.t -> 'a t) -> 'a t
(** [fix f] makes a fixpoint *) (** [fix f] makes a fixpoint *)
module Infix : sig module Infix : sig
val (>|=) : 'a t -> ('a -> 'b) -> 'b t val (>|=) : 'a t -> ('a -> 'b) -> 'b t
(** Infix version of {!map} *)
val (<*>) : ('a -> 'b) t -> 'a t -> 'b t val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
(** Synonym to {!app} *) (** Synonym to {!app} *)
@ -149,6 +181,9 @@ module Infix : sig
val (<+>) : 'a t -> 'a t -> 'a t val (<+>) : 'a t -> 'a t -> 'a t
(** [a <+> b] is [choice [a;b]], a binary choice *) (** [a <+> b] is [choice [a;b]], a binary choice *)
val (<::>) : 'a t -> 'a list t -> 'a list t
(** [a <::> b] is [app (fun x l -> x::l) a b] *)
end end
include module type of Infix include module type of Infix