parser combinators for deserializing token sequences directly into

any type, without building the intermediate Sexpr.t. GADT are now necessary, to keep the
Sexpr.parse function type-safe
This commit is contained in:
Simon Cruanes 2013-02-08 20:26:06 +01:00
parent af0c94dab4
commit cce20838c4
2 changed files with 205 additions and 0 deletions

148
sexpr.ml
View file

@ -28,6 +28,8 @@ type t =
(** Token that compose a Sexpr once serialized *)
type token = [`Open | `Close | `Atom of string]
(** {2 Traverse a sequence of tokens} *)
(** Iterate on the S-expression, calling the callback with tokens *)
let rec iter f s = match s with
| Atom a -> f (`Atom a)
@ -39,6 +41,23 @@ and iter_list f l = match l with
(** Traverse. This yields a sequence of tokens *)
let traverse s = Sequence.from_iter (fun k -> iter k s)
(** Returns the same sequence of tokens, but during iteration, if
the structure of the Sexpr corresponding to the sequence
is wrong (bad parenthesing), Invalid_argument is raised
and iteration is stoped *)
let validate seq =
let depth = ref 0 in
Sequence.map
(fun tok -> match tok with
| `Open -> incr depth; tok
| `Close -> if !depth = 0
then raise (Invalid_argument "wrong parenthesing")
else decr depth; tok
| _ -> tok)
seq
(** {2 Text <-> tokens} *)
(** Lex: create a sequence of tokens from the given in_channel. *)
let lex input =
let seq_fun k =
@ -86,6 +105,8 @@ let of_seq seq =
| [] -> failwith "no Sexpr could be parsed"
| _ -> failwith "too many elements on the stack"
(** {2 Printing} *)
(** Print a token on the given formatter *)
let pp_token formatter token = match token with
| `Open -> Format.fprintf formatter "@[("
@ -113,3 +134,130 @@ let pp_sexpr ?(indent=false) formatter s =
then Format.fprintf formatter "@[<hov 4>%a@]" pp_tokens (traverse s)
else pp_tokens formatter (traverse s)
(** {2 Parsing} *)
(** Monadic combinators for parsing data from a sequence of tokens,
without converting to concrete S-expressions.
The [one] parser can raise ParseFailure if it fails to parse
the atomic type. *)
(** parser that returns a 'a *)
type 'a parser =
| Return : 'a -> 'a parser
| One : (token -> 'a) -> 'a parser
| Zero : (token -> 'a parser) -> 'a parser
(* | Maybe of (token -> 'a option) *)
| Bind : ('b parser * ('b -> 'a parser)) -> 'a parser
| Fail : string -> 'a parser
exception ParseFailure of string
let (>>=) p f = Bind (p, f)
let (>>) p p' = p >>= fun _ -> p'
let return x = Return x
let fail reason = Fail reason
let one f = One f
let lookahead f = Zero f
let left = One (function | `Open -> ()
| _ -> raise (ParseFailure "expected '('"))
let right = One (function | `Close -> ()
| _ -> raise (ParseFailure "expected ')'"))
let pair f g =
f >>= fun x ->
g >>= fun y ->
return (x, y)
let triple f g h =
f >>= fun x ->
g >>= fun y ->
h >>= fun z ->
return (x, y, z)
(** Maps the value returned by the parser *)
let map p f = p >>= fun x -> return (f x)
let p_str = one
(function | `Atom s -> s | _ -> raise (ParseFailure "expected string"))
let p_int = one
(function | `Atom s -> (try int_of_string s
with Failure _ -> raise (ParseFailure "expected int"))
| _ -> raise (ParseFailure "expected int"))
let p_bool = one
(function | `Atom s -> (try bool_of_string s
with Failure _ -> raise (ParseFailure "expected bool"))
| _ -> raise (ParseFailure "expected bool"))
let p_float = one
(function | `Atom s -> (try float_of_string s
with Failure _ -> raise (ParseFailure "expected float"))
| _ -> raise (ParseFailure "expected float"))
let many p =
let rec elements token =
match token with
| `Close -> return []
| _ ->
p >>= fun x ->
lookahead elements >>= fun l ->
return (x :: l)
in
left >> lookahead elements >>= fun l -> right >> return l
let many1 p =
p >>= fun x ->
many p >>= fun l ->
return (x::l)
(** parsing state that returns a 'a *)
type 'a state =
| Bottom : 'a state
| Push : ('b parser * ('b -> 'a state)) -> 'a state
(** Actually parse the sequence of tokens *)
let parse p tokens =
let res = ref None in
let state = Push(p, fun x -> (res := Some x; Bottom)) in
(* Token handler. It also takes the current parser. *)
let rec one_step state token =
match state with
| Bottom -> (* should not happen, unless there are too many tokens *)
raise (ParseFailure "unexpected ')'")
| Push (Return x, cont) ->
let state' = cont x in
one_step state' token (* do not consume token *)
| Push (Zero f, cont) ->
let p' = f token in
let state' = Push (p', cont) in
one_step state' token (* do not consume token *)
| Push (One f, cont) ->
let x = f token in
let state' = cont x in
state' (* consume token *)
(* | Maybe f, _ -> let x = f token in (Obj.magic cont) x *)
| Push (Bind (p', cont'), cont) ->
let cont'' x =
let p'' = cont' x in
Push (p'', cont)
in
let state' = Push (p', cont'') in
one_step state' token (* do not consume token *)
| Push (Fail reason, _) -> raise (ParseFailure reason)
in
(* iterate on the tokens *)
ignore (Sequence.fold one_step state tokens);
(* return result *)
match !res with
| None -> raise (ParseFailure "incomplete input")
| Some x -> x

View file

@ -36,6 +36,12 @@ val iter : (token -> unit) -> t -> unit
val traverse : t -> token Sequence.t
(** Traverse. This yields a sequence of tokens *)
val validate : token Sequence.t -> token Sequence.t
(** Returns the same sequence of tokens, but during iteration, if
the structure of the Sexpr corresponding to the sequence
is wrong (bad parenthesing), Invalid_argument is raised
and iteration is stoped *)
(** {2 Text <-> tokens} *)
val lex : char Sequence.t -> token Sequence.t
@ -55,3 +61,54 @@ val pp_tokens : Format.formatter -> token Sequence.t -> unit
val pp_sexpr : ?indent:bool -> Format.formatter -> t -> unit
(** Pretty-print the S-expr. If [indent] is true, the S-expression
is printed with indentation. *)
(** {2 Parsing} *)
(** Monadic combinators for parsing data from a sequence of tokens,
without converting to concrete S-expressions. *)
type 'a parser
exception ParseFailure of string
val (>>=) : 'a parser -> ('a -> 'b parser) -> 'b parser
(** Monadic bind: computes a parser from the result of
the first parser *)
val (>>) : 'a parser -> 'b parser -> 'b parser
(** Like (>>=), but ignores the result of the first parser *)
val return : 'a -> 'a parser
(** Parser that consumes no input and return the given value *)
val fail : string -> 'a parser
(** Fails parsing with the given message *)
val one : (token -> 'a) -> 'a parser
(** consumes one token with the function *)
val lookahead : (token -> 'a parser) -> 'a parser
(** choose parser given current token *)
val left : unit parser
(** Parses a `Open *)
val right : unit parser
(** Parses a `Close *)
val pair : 'a parser -> 'b parser -> ('a * 'b) parser
val triple : 'a parser -> 'b parser -> 'c parser -> ('a * 'b * 'c) parser
val map : 'a parser -> ('a -> 'b) -> 'b parser
(** Maps the value returned by the parser *)
val p_str : string parser
val p_int : int parser
val p_bool : bool parser
val many : 'a parser -> 'a list parser
val many1 : 'a parser -> 'a list parser
val parse : 'a parser -> token Sequence.t -> 'a
(** Actually parse the sequence of tokens. Raises
ParseFailure if anything goes wrong. *)