From 39f5e135bd43961c85690354378568ab1065ce2e Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Wed, 11 Jun 2014 00:47:42 +0200 Subject: [PATCH] crazy input-driven parser combinators --- _oasis | 2 +- misc/parseReact.ml | 235 ++++++++++++++++++++++++++++++++++++++++++++ misc/parseReact.mli | 113 +++++++++++++++++++++ 3 files changed, 349 insertions(+), 1 deletion(-) create mode 100644 misc/parseReact.ml create mode 100644 misc/parseReact.mli diff --git a/_oasis b/_oasis index 2e97f303..d878e97e 100644 --- a/_oasis +++ b/_oasis @@ -61,7 +61,7 @@ Library "containers_misc" UnionFind, SmallSet, AbsSet, CSM, ActionMan, QCheck, BencodeOnDisk, TTree, HGraph, Automaton, Conv, Bidir, Iteratee, - Ty, Tell, BencodeStream, RatTerm, Cause, AVL + Ty, Tell, BencodeStream, RatTerm, Cause, AVL, ParseReact BuildDepends: unix,containers FindlibName: misc FindlibParent: containers diff --git a/misc/parseReact.ml b/misc/parseReact.ml new file mode 100644 index 00000000..1549db83 --- /dev/null +++ b/misc/parseReact.ml @@ -0,0 +1,235 @@ + +(* +copyright (c) 2013-2014, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Parser combinators driven by the input} *) + +type ('a, 'b) t = + | Return : 'b -> ('a,'b) t + | Delay : (unit -> ('a, 'b) t) -> ('a, 'b) t + | One : ('a, 'a) t + | Stop : ('a, unit) t + | Bind : ('a, 'b) t * ('b -> ('a, 'c) t) -> ('a, 'c) t + | Choice : ('a, 'b) t * ('a, 'b) t -> ('a, 'b) t + | Map : ('a, 'b) t * ('b -> 'c) -> ('a, 'c) t + | Guard : ('a, 'b) t * ('b -> bool) -> ('a, 'b) t + | Skip : ('a, unit) t + | IfThenElse: ('a -> bool) * ('a, 'b) t * ('a, 'b) t -> ('a, 'b) t + | Fail : ('a, 'b) t + +let stop = Stop + +let return x = Return x + +let delay f = Delay f + +let return' f = Delay (fun () -> return (f ())) + +let fail = Fail + +let one = One + +let skip = Skip + +let bind f p = Bind (p, f) + +let (>>=) p f = bind f p + +let exact ?(eq=(=)) x = + one + >>= fun y -> + if eq x y then Return () else Fail + +let guard f p = Guard (p, f) + +let (>>) p1 p2 = p1 >>= fun _ -> p2 + +let map f p = Map (p, f) + +let (>>|) p f = Map (p, f) + +let (<|>) p1 p2 = Choice (p1, p2) + +let pair p1 p2 = + p1 >>= fun x1 -> + p2 >>= fun x2 -> + return (x1, x2) + +let triple p1 p2 p3 = + p1 >>= fun x1 -> + p2 >>= fun x2 -> + p3 >>= fun x3 -> + return (x1, x2, x3) + +let if_then_else p a b = IfThenElse (p, a, b) + +(** {6 Utils} *) + +let take_while pred = + let rec next acc = + if_then_else pred + (one >>= fun x -> next (x::acc)) + (return' (fun () -> List.rev acc)) + in + next [] + +let take_n n = + let rec next acc n = + if n = 0 + then return (List.rev acc) + else one >>= fun x -> next (x::acc) (n-1) + in + next [] n + +let skip_spaces = + let rec next () = + if_then_else + (fun c -> c = ' ' || c = '\t' || c = '\n') + (skip >> delay next) + (return ()) + in next () + +let ident = + let accept = function + | c when Char.code c >= Char.code 'a' && Char.code c <= Char.code 'z' -> true + | c when Char.code c >= Char.code 'A' && Char.code c <= Char.code 'Z' -> true + | c when Char.code c >= Char.code '0' && Char.code c <= Char.code '9' -> true + | _ -> false + in + let rec aggregate buf = + if_then_else + accept + (one >>= fun c -> Buffer.add_char buf c; aggregate buf) + (return (Buffer.contents buf)) + in + (* create buffer on demand, to avoid sharing it *) + delay (fun () -> aggregate (Buffer.create 32)) + +let many ~sep p = + let rec next acc = + (return (List.rev acc)) + <|> (p >>= fun x -> sep >> next (x::acc)) + in + next [] + +let many1 ~sep p = + let rec next acc = + p >>= fun x -> + let acc = x :: acc in + (return (List.rev acc)) + <|> (sep >> next acc) + in + next [] + +(** {6 Run} *) + +type 'a sequence = ('a -> unit) -> unit + +let _fold_seq f acc seq = + let acc = ref acc in + seq (fun x -> acc := f !acc x); + !acc + +(** Partial state during parsing: a tree of continuations *) +type (_, _) state = + | STBottom : 'b -> ('a, 'b) state + | STPush : ('a, 'c) t * ('c -> ('a, 'b) state list) -> ('a, 'b) state + +let (>>>) p cont = STPush (p, cont) + +let run p seq = + (* normalize the stack (do not let a "return" on top) *) + let rec reduce : type a b. (a,b)state -> (a,b) state list + = fun stack -> match stack with + | STPush (Return x, cont) -> CCList.flat_map reduce (cont x) + | STPush (Delay f, cont) -> reduce (f () >>> cont) + | STPush (Bind (p, f), cont) -> + let stack' = p >>> fun x -> [f x >>> cont] in + reduce stack' + | STPush (Choice (a, b), cont) -> + (* fork into sub-stacks *) + CCList.append (reduce (a >>> cont)) (reduce (b >>> cont)) + | STPush (Map (p, f), cont) -> + let stack' = p >>> fun x -> cont (f x) in + reduce stack' + | STPush (Guard (p, f), cont) -> + let stack' = p >>> fun x -> if f x then cont x else [] in + reduce stack' + | _ -> [stack] + in + (* consume one input token *) + let rec consume_one : type a b. (a,b) state -> a -> (a,b) state list + = fun stack x -> match stack with + | STBottom _ -> [] (* fail *) + | STPush (Stop, _) -> [] (* fail *) + | STPush (Fail, _) -> [] (* fail *) + | STPush (One, cont) -> CCList.flat_map reduce (cont x) + | STPush (Skip, cont) -> CCList.flat_map reduce (cont ()) + | STPush (IfThenElse (p, yay, nay), cont) -> + let l = if p x + then reduce (yay >>> cont) + else reduce (nay >>> cont) + in + CCList.flat_map (fun stack -> consume_one stack x) l + | STPush (Return _, _) -> assert false + | STPush (Delay _, _) -> assert false + | STPush (Bind _, _) -> assert false + | STPush (Choice _, _) -> assert false + | STPush (Map _, _) -> assert false + | STPush (Guard _, _) -> assert false + in + (* to be called at the end of input *) + let finish : type a b. (a,b) state -> (a,b) state list + = fun stack -> match stack with + | STPush (Stop, cont) -> CCList.flat_map reduce (cont ()) + | STPush (Fail, _) -> [] + | _ -> [stack] + in + (* how to parse the input: step by step, starting with [p] as initial parser *) + let step l x = CCList.flat_map (fun p -> consume_one p x) l in + let initial_state = p >>> fun x -> [STBottom x] in + let res = _fold_seq step [initial_state] seq in + (* signal "end of input" *) + let res = CCList.flat_map finish res in + (* recover results *) + CCList.filter_map + (function + | STBottom x -> Some x + | _ -> None + ) res + +(*$R + type sexp = Atom of string | List of sexp list \ + let atom i = Atom i \ + let list_ i = List i \ + + let rec p () = + (skip_spaces >> ident >>= atom) + <|> (skip_spaces >> exact '(' >> many1 ~sep:(exact ' ') (delay p) >>= fun l + >> skip_spaces >> exact ')' >> return (list_ l)) + in + run (p ()) (CCSequence.of_string "(a b (c d))") = + [list_ [atom "a"; atom "b"; list_ [atom "c"; atom "d"]]] +*) diff --git a/misc/parseReact.mli b/misc/parseReact.mli new file mode 100644 index 00000000..da823495 --- /dev/null +++ b/misc/parseReact.mli @@ -0,0 +1,113 @@ + +(* +copyright (c) 2013-2014, simon cruanes +all rights reserved. + +redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Parser combinators driven by the input} *) + +type ('input, 'result) t +(** parser that takes some type as input and outputs a value of type 'result +when it's done *) + +(** {6 Basic Building Blocs} *) + +val stop : ('a, unit) t +(** Succeed exactly at the end of input *) + +val return : 'b -> ('a, 'b) t +(** Return a value *) + +val return' : (unit -> 'b) -> ('a, 'b) t +(** Suspended version of {!return}, does not evaluate yet *) + +val delay : (unit -> ('a, 'b) t) -> ('a, 'b) t +(** Delay evaluation of the parser *) + +val fail : ('a, 'b) t +(** Failure *) + +val one : ('a, 'a) t +(** Parse one value exactly *) + +val skip : ('a, unit) t +(** Ignore the next value *) + +val exact : ?eq:('a -> 'a -> bool) -> 'a -> ('a, unit) t +(** Accept one value as input exactly *) + +val guard : ('b -> bool) -> ('a, 'b) t -> ('a, 'b) t +(** Ensure the return value of the given parser satisfies the predicate. + [guard f p] will be the same as [p] if [p] returns + some [x] with [f x = true]. If [not (f x)], then [guard f p] fails. *) + +val bind : ('b -> ('a, 'c) t) -> ('a, 'b) t -> ('a, 'c) t + +val (>>=) : ('a, 'b) t -> ('b -> ('a, 'c) t) -> ('a, 'c) t + +val (>>) : ('a, 'b) t -> ('a, 'c) t -> ('a, 'c) t +(** Wait for the first parser to succeed, then switch to the second one *) + +val map : ('b -> 'c) -> ('a, 'b) t -> ('a, 'c) t +(** Map outputs *) + +val (>>|) : ('a, 'b) t -> ('b -> 'c) -> ('a, 'c) t +(** Infix version of {!map} *) + +val (<|>) : ('a, 'b) t -> ('a, 'b) t -> ('a, 'b) t +(** Non-deterministic choice. Both branches are evaluated in parallel *) + +val pair : ('a,'b) t -> ('a, 'c) t -> ('a, ('b * 'c)) t +val triple : ('a,'b) t -> ('a, 'c) t -> ('a, 'd) t -> ('a, ('b * 'c * 'd)) t + +val if_then_else : ('a -> bool) -> ('a, 'b) t -> ('a, 'b) t -> ('a, 'b) t +(** Test the next input, and choose the parser based on it. Does not consume + the input token for the test *) + +(** {6 Utils} *) + +val take_while : ('a -> bool) -> ('a, 'a list) t +(** Take input while it satisfies the given predicate *) + +val take_n : int -> ('a, 'a list) t +(** Take n input elements *) + +val skip_spaces : (char, unit) t +(** Skip whitespace (space,tab,newline) *) + +val ident : (char, string) t +(** Parse identifiers (stops on whitespaces) *) + +val many : sep:('a,_) t -> ('a, 'b) t -> ('a, 'b list) t +(** [many ~sep p] parses as many [p] as possible, separated by [sep]. *) + +val many1 : sep:('a,_) t -> ('a, 'b) t -> ('a, 'b list) t + +(** {6 Run} *) + +type 'a sequence = ('a -> unit) -> unit + +val run : ('a,'b) t -> 'a sequence -> 'b list +(** List of results. Each element of the list comes from a successful + series of choices [<|>]. If no choice operator was used, the list + contains 0 or 1 elements *)