From 0ec40c2331a2d208366ab3f8ebee83701088cb96 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Tue, 4 May 2021 23:11:41 -0400 Subject: [PATCH] CCParse: heavy refactoring, many new functions --- src/core/CCParse.ml | 778 +++++++++++++++++++++++++++---------------- src/core/CCParse.mli | 195 +++++++---- 2 files changed, 627 insertions(+), 346 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index f8a15cc8..a069052b 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -16,13 +16,13 @@ open CCShims_ let ptree = fix @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) let ptree' = fix_memo @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) @@ -39,7 +39,7 @@ open CCShims_ let erreq eq x y = match x, y with | Ok x, Ok y -> eq x y | Error _ , Error _ -> true - | _ -> false + | _ -> false ;; *) (*$= & ~printer:errpptree @@ -56,7 +56,7 @@ open CCShims_ (*$R let p = U.list ~sep:"," U.word in let printer = function - | Ok l -> "Ok " ^ CCFormat.(to_string (list string)) l + | Ok l -> "Ok " ^ CCFormat.(to_string (Dump.list string_quoted)) l | Error s -> "Error " ^ s in assert_equal ~printer @@ -84,7 +84,7 @@ open CCShims_ let open CCParse.Infix in let module P = CCParse in - let parens p = P.try_ (P.char '(') *> p <* P.char ')' in + let parens p = P.char '(' *> p <* P.char ')' in let add = P.char '+' *> P.return (+) in let sub = P.char '-' *> P.return (-) in let mul = P.char '*' *> P.return ( * ) in @@ -94,7 +94,7 @@ open CCShims_ let chainl1 e op = P.fix (fun r -> - e >>= fun x -> P.try_ (op <*> P.return x <*> r) <|> P.return x) in + e >>= fun x -> (op <*> P.return x <*> r) <|> P.return x) in let expr : int P.t = P.fix (fun expr -> @@ -107,193 +107,270 @@ open CCShims_ () *) -type 'a or_error = ('a, string) result +module Error = struct + type t = { + msg: unit -> string; + str: string; + offset: int; (* offset in [e_str] *) + } -type line_num = int -type col_num = int + let get_loc_ (self:t) : int * int = + let i = ref 0 in + let continue = ref true in + let line = ref 1 in + let col = ref 1 in + while !continue && !i < self.offset do + match String.index_from self.str !i '\n' with + | exception Not_found -> + col := self.offset - !i; continue := false; + | j when j > self.offset -> + col := self.offset - !i; continue := false; + | j -> incr line; i := j+1; + done; + !line, !col -module MemoTbl = struct - module H = Hashtbl.Make(struct - type t = int * int (* id of parser, position *) - let equal ((a,b):t)(c,d) = a=c && b=d - let hash = Hashtbl.hash - end) + let line_and_column self = get_loc_ self + let msg self = self.msg() + let to_string self = + let line,col = get_loc_ self in + Printf.sprintf "at line %d, char %d:\n%s" line col (self.msg()) + + let pp out self = + let line,col = get_loc_ self in + Format.fprintf out "at line %d, char %d:@ %s" line col (self.msg()) +end + +type 'a or_error = ('a, Error.t) result + +module Memo_tbl = Hashtbl.Make(struct + type t = int * int (* id of parser, position *) + let equal ((a,b):t)(c,d) = a=c && b=d + let hash = Hashtbl.hash + end) + +module Memo_state = struct (* table of closures, used to implement universal type *) - type t = (unit -> unit) H.t lazy_t - - let create n = lazy (H.create n) + type t = (unit -> unit) Memo_tbl.t (* unique ID for each parser *) let id_ = ref 0 - - type 'a res = - | Fail of exn - | Ok of 'a end (* TODO: [type position = {state: state; i: int}] and recompute line, col on demand *) type position = int * int * int (* pos, line, column *) -type parse_branch = (line_num * col_num * string option) list - +(** Purely functional state passed around *) type state = { str: string; (* the input *) - mutable i: int; (* offset *) - mutable lnum : line_num; (* Line number *) - mutable cnum : col_num; (* Column number *) - mutable branch: parse_branch; - memo : MemoTbl.t; (* Memoization table, if any *) + i: int; (* offset in [input.str] *) + memo : Memo_state.t option ref; (* Memoization table, if any *) } -(* TODO: remove lnum/cnum, recompute them lazily in errors *) -exception ParseError of parse_branch * (unit -> string) - -let char_equal (a : char) b = Stdlib.(=) a b -let string_equal (a : string) b = Stdlib.(=) a b - -let rec string_of_branch l = - let pp_s () = function - | None -> "" - | Some s -> Format.sprintf "while parsing %s, " s - in - match l with - | [] -> "" - | [l,c,s] -> - Format.sprintf "@[%aat line %d, col %d@]" pp_s s l c - | (l,c,s) :: tail -> - Format.sprintf "@[%aat line %d, col %d@]@,%s" pp_s s l c (string_of_branch tail) +let[@inline] char_equal (a : char) b = Stdlib.(=) a b +let string_equal = String.equal +(* FIXME: printer for error let () = Printexc.register_printer (function | ParseError (b,msg) -> Some (Format.sprintf "@[%s@ %s@]" (msg()) (string_of_branch b)) | _ -> None) + *) -let const_ x () = x +let[@inline] const_str_ x () : string = x let state_of_string str = let s = { str; i=0; - lnum=1; - cnum=1; - branch=[]; - memo=MemoTbl.create 32; + memo=ref None; } in s -let is_done st = st.i = String.length st.str -let cur st = st.str.[st.i] +let[@inline] is_done st = st.i >= String.length st.str +let[@inline] cur st = st.str.[st.i] -let fail_ ~err st msg = - let b = (st.lnum, st.cnum, None) :: st.branch in - err (ParseError (b, msg)) +let mk_error_ st msg : Error.t = + {Error.msg; str=st.str; offset=st.i} -let next st ~ok ~err = - if st.i = String.length st.str - then fail_ ~err st (const_ "unexpected end of input") - else ( +(* consume one char, passing it to [ok]. *) +let consume_ st ~ok ~err = + if is_done st then ( + let msg = const_str_ "unexpected end of input" in + err (mk_error_ st msg) + ) else ( let c = st.str.[st.i] in - st.i <- st.i + 1; - if char_equal c '\n' - then (st.lnum <- st.lnum + 1; st.cnum <- 1) - else st.cnum <- st.cnum + 1; - ok c + ok {st with i=st.i + 1} c ) -let pos st = st.i, st.lnum, st.cnum - -let backtrack st (i',l',c') = - assert (0 <= i' && i' <= st.i); - st.i <- i'; - st.lnum <- l'; - st.cnum <- c'; - () - -(* FIXME: - remove all backtracking stuff and instead, pass the state as parameter - to [ok] and [err], with an explicit offset that changes. *) - -type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit +type 'a t = { + run: 'b. state -> ok:(state -> 'a -> 'b) -> err:(Error.t -> 'b) -> 'b; +} [@@unboxed] (** Takes the input and two continuations: {ul - {- [ok] to call with the result when it's done} + {- [ok] to call with the result and new state when it's done} {- [err] to call when the parser met an error} } *) -let return : 'a -> 'a t = fun x _st ~ok ~err:_ -> ok x +let return x : _ t = { + run=fun st ~ok ~err:_ -> ok st x +} + let pure = return -let (>|=) : 'a t -> ('a -> 'b) -> 'b t - = fun p f st ~ok ~err -> p st ~ok:(fun x -> ok (f x)) ~err -let (>>=) : 'a t -> ('a -> 'b t) -> 'b t - = fun p f st ~ok ~err -> p st ~ok:(fun x -> f x st ~ok ~err) ~err -let (<*>) : ('a -> 'b) t -> 'a t -> 'b t - = fun f x st ~ok ~err -> - f st ~ok:(fun f' -> x st ~ok:(fun x' -> ok (f' x')) ~err) ~err -let (<* ) : 'a t -> _ t -> 'a t - = fun x y st ~ok ~err -> - x st ~ok:(fun res -> y st ~ok:(fun _ -> ok res) ~err) ~err -let ( *>) : _ t -> 'a t -> 'a t - = fun x y st ~ok ~err -> - x st ~ok:(fun _ -> y st ~ok ~err) ~err + +let (>|=) (p: 'a t) f : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> ok st (f x)) + ~err +} + +let (>>=) (p:'a t) f : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + let p2 = f x in + p2.run st ~ok ~err) + ~err +} + +let (<*>) (f:_ t) (a:_ t) : _ t = { + run=fun st ~ok ~err -> + f.run st + ~ok:(fun st f -> + a.run st ~ok:(fun st x -> ok st (f x)) ~err) + ~err +} + +let (<*) (a:_ t) (b:_ t) : _ t = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st x -> + b.run st ~ok:(fun st _ -> ok st x) ~err) + ~err +} + +let ( *> ) (a:_ t) (b:_ t) : _ t = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st _ -> + b.run st ~ok:(fun st x -> ok st x) ~err) + ~err +} let map f x = x >|= f let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z -let junk_ st = next st ~ok:ignore ~err:(fun _ -> assert false) +let junk_ (st:state) : state = + assert (st.i < String.length st.str); + {st with i=st.i + 1} -let eoi st ~ok ~err = +let eoi = { + run=fun st ~ok ~err -> if is_done st - then ok () - else fail_ ~err st (const_ "expected EOI") + then ok st () + else err (mk_error_ st (const_str_ "expected end of input")) +} -let fail msg st ~ok:_ ~err = fail_ ~err st (const_ msg) +let fail msg : _ t = { + run=fun st ~ok:_ ~err -> + err (mk_error_ st (const_str_ msg)) +} let failf msg = Printf.ksprintf fail msg -let parsing s p st ~ok ~err = - st.branch <- (st.lnum, st.cnum, Some s) :: st.branch; - p st - ~ok:(fun x -> st.branch <- List.tl st.branch; ok x) - ~err:(fun e -> st.branch <- List.tl st.branch; err e) +let parsing what p = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun e -> + let msg() = + Printf.sprintf "while parsing %s:\n%s" what (e.Error.msg ()) + in + err {e with Error.msg}) +} -let nop _ ~ok ~err:_ = ok() +let nop = { + run=fun st ~ok ~err:_ -> ok st (); +} -let char c = - let msg = Printf.sprintf "expected '%c'" c in - fun st ~ok ~err -> - next st - ~ok:(fun c' -> if char_equal c c' then ok c else fail_ ~err st (const_ msg)) ~err +let any_char = { + run=fun st ~ok ~err -> consume_ st ~ok ~err +} -let char_if p st ~ok ~err = - next st - ~ok:(fun c -> - if p c then ok c - else fail_ ~err st (fun () -> Printf.sprintf "unexpected char '%c'" c) - ) ~err +let char c : _ t = { + run=fun st ~ok ~err -> + consume_ st + ~ok:(fun st c2 -> + if char_equal c c2 then ok st c + else ( + let msg() = Printf.sprintf "expected '%c', got '%c'" c (cur st) in + err (mk_error_ st msg) + )) + ~err +} -let chars_if p st ~ok ~err:_ = - let i = st.i in - let len = ref 0 in - while not (is_done st) && p (cur st) do junk_ st; incr len done; - ok (String.sub st.str i !len) +let char_if ?descr p = { + run=fun st ~ok ~err -> + consume_ st + ~ok:(fun st c -> + if p c then ok st c + else ( + let msg() = + let rest = match descr with + | None -> "" + | Some d -> Printf.sprintf ", expected %s" d + in + Printf.sprintf "unexpected char '%c'%s" c rest + in + err (mk_error_ st msg) + )) + ~err +} -let chars1_if p st ~ok ~err = - chars_if p st - ~ok:(fun s -> - if string_equal s "" - then fail_ ~err st (const_ "unexpected sequence of chars") - else ok s) - ~err +let chars_if p = { + run=fun st ~ok ~err:_ -> + let i0 = st.i in + let i = ref i0 in + while + let st = {st with i = !i} in + not (is_done st) && p (cur st) + do + incr i; + done; + ok {st with i = !i} (String.sub st.str i0 (!i - i0)) +} -let rec skip_chars p st ~ok ~err = - if not (is_done st) && p (cur st) then ( - junk_ st; - skip_chars p st ~ok ~err - ) else ok() +let chars1_if ?descr p = { + run=fun st ~ok ~err -> + (chars_if p).run st + ~ok:(fun st s -> + if string_equal s "" + then ( + let msg() = + let what = match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + err (mk_error_ st msg) + ) else ok st s) + ~err +} + +let skip_chars p : _ t = + let rec self = { + run=fun st ~ok ~err -> + if not (is_done st) && p (cur st) then ( + let st = junk_ st in + self.run st ~ok ~err + ) else ok st () + } + in + self let is_alpha = function | 'a' .. 'z' | 'A' .. 'Z' -> true @@ -308,77 +385,136 @@ let is_white = function ' ' | '\t' | '\n' -> true | _ -> false let space = char_if is_space let white = char_if is_white -let endline st ~ok ~err = - next st - ~ok:(function - | '\n' as c -> ok c - | _ -> fail_ ~err st (const_ "expected end-of-line")) - ~err - +let endline = + char_if ~descr:"end-of-line ('\\n')" (function '\n' -> true | _ -> false) let skip_space = skip_chars is_space let skip_white = skip_chars is_white -let (<|>) : 'a t -> 'a t -> 'a t - = fun x y st ~ok ~err -> - let i = st.i in - x st ~ok - ~err:(fun e -> - let j = st.i in - if i=j then y st ~ok ~err (* try [y] *) - else err e (* fail *) +let or_ (p1:'a t) (p2:'a t) : _ t = { + run=fun st ~ok ~err -> + p1.run st ~ok + ~err:(fun _e -> p2.run st ~ok ~err) +} +let (<|>) = or_ + +let (|||) a b = map2 (fun x y ->x,y) a b + +let try_or p1 ~f ~else_:p2 = { + run=fun st ~ok ~err -> + p1.run st + ~ok:(fun st x -> (f x).run st ~ok ~err) + ~err:(fun _ -> p2.run st ~ok ~err) +} + +let suspend f = { + run=fun st ~ok ~err -> + let p = f () in + p.run st ~ok ~err +} + +let () (p:'a t) msg : _ t = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) +} + +(* read [len] chars at once *) +let any_chars len : _ t = { + run=fun st ~ok ~err -> + if st.i + len <= String.length st.str then ( + let s = String.sub st.str st.i len in + let st = {st with i = st.i + len} in + ok st s + ) else ( + let msg() = + Printf.sprintf "expected to be able to consume %d chars" len + in + err (mk_error_ st msg) + ) +} + +let exact s = { + run=fun st ~ok ~err -> + (* parse a string of length [String.length s] and compare with [s] *) + (any_chars (String.length s)).run st + ~ok:(fun st s2 -> + if string_equal s s2 then ok st s + else ( + let msg() = Printf.sprintf "expected %S, got %S" s s2 in + err (mk_error_ st msg) + ) + ) + ~err +} + +let string = exact + +let fix f = + let rec self = { + run=fun st ~ok ~err -> + (Lazy.force f_self).run st ~ok ~err + } + and f_self = lazy (f self) in + self + +let try_ p : _ t = { + run=fun st ~ok ~err:_ -> + p.run st + ~ok:(fun st x -> ok st (Some x)) + ~err:(fun _ -> ok st None) +} + +let optional p : _ t = { + run=fun st ~ok ~err:_ -> + p.run st + ~ok:(fun st _x -> ok st ()) + ~err:(fun _ -> ok st ()) +} + +let many_until ~until p : _ t = + fix + (fun self -> + try_or until ~f:(fun _ -> pure []) + ~else_:( + p >>= fun x -> + self >|= fun l -> x :: l + )) + +let many p : _ t = + fix + (fun self -> + try_or p + ~f:(fun x -> self >|= fun tl -> x :: tl) + (pure [])) + +(* +(* parse many [p], as a difference list *) +let many_rec_ p : (_ list -> _ list) t = + let rec self = { + run=fun st ~ok ~err -> + if is_done st then ok st (fun l->l) (* empty list *) + else ( + p.run st + ~ok:(fun st x -> + self.run st + ~ok:(fun st f -> ok st (fun l -> x :: f l)) + ~err) + ~err ) + } in + self -let try_ : 'a t -> 'a t - = fun p st ~ok ~err -> - let i = pos st in - p st ~ok - ~err:(fun e -> - backtrack st i; - err e) - -let suspend f st ~ok ~err = f () st ~ok ~err - -let () : 'a t -> string -> 'a t - = fun x msg st ~ok ~err -> - let i = st.i in - x st ~ok - ~err:(fun e -> - if st.i = i - then fail_ ~err st (fun () -> msg) - else err e) - -let string s st ~ok ~err = - let rec check i = - if i = String.length s then ok s - else - next st - ~ok:(fun c -> - if char_equal c s.[i] - then check (i+1) - else fail_ ~err st (fun () -> Printf.sprintf "expected \"%s\"" s)) - ~err - in - check 0 - -let rec many_rec : 'a t -> 'a list -> 'a list t = fun p acc st ~ok ~err -> - if is_done st then ok(List.rev acc) - else - p st - ~ok:(fun x -> - let i = pos st in - let acc = x :: acc in - many_rec p acc st ~ok - ~err:(fun _ -> - backtrack st i; - ok(List.rev acc)) - ) ~err - -let many : 'a t -> 'a list t - = fun p st ~ok ~err -> many_rec p [] st ~ok ~err +let many p : _ t = { + run=fun st ~ok ~err -> + (many_rec_ p).run st + ~ok:(fun st f -> ok st (f [])) + ~err +} + *) (*$R let p0 = skip_white *> U.int in - let p = (skip_white *> char '(' *> many p0) <* try_ (skip_white <* char ')') in + let p = (skip_white *> char '(' *> many p0) <* (skip_white <* char ')') in let printer = CCFormat.(to_string @@ Dump.result @@ Dump.list int) in assert_equal ~printer (Ok [1;2;3]) (parse_string p "(1 2 3)"); @@ -387,91 +523,165 @@ let many : 'a t -> 'a list t *) -let many1 : 'a t -> 'a list t = - fun p st ~ok ~err -> - p st ~ok:(fun x -> many_rec p [x] st ~ok ~err) ~err - -let rec skip p st ~ok ~err = - let i = pos st in - p st - ~ok:(fun _ -> skip p st ~ok ~err) - ~err:(fun _ -> - backtrack st i; - ok() - ) - -(* by (sep1 ~by p) *) -let rec sep_rec ~by p = try_ by *> sep1 ~by p - -and sep1 ~by p = +let many1 p = p >>= fun x -> - (sep_rec ~by p >|= fun tl -> x::tl) - <|> return [x] + many p >|= fun l -> x :: l + +(* skip can be made efficient by not allocating intermediate parsers *) +let skip p : _ t = + let rec self = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st _ -> self.run st ~ok ~err) + ~err:(fun _ -> + ok st ()) + } in + self + +let sep_until ~until ~by p = + let rec read_p = lazy ( + p >>= fun x -> + (until *> pure [x]) + <|> + (by *> (Lazy.force read_p >|= fun tl -> x :: tl)) + ) in + (until *> pure []) + <|> (Lazy.force read_p) let sep ~by p = - (try_ p >>= fun x -> - (sep_rec ~by p >|= fun tl -> x::tl) - <|> return [x]) - <|> return [] + let rec read_p = lazy ( + try_or p + ~f:(fun x -> + (eoi *> pure [x]) + <|> + try_or by + ~f:(fun _ -> Lazy.force read_p >|= fun tl -> x :: tl) + (pure [x])) + (pure []) + ) in + Lazy.force read_p -let fix f = - let rec p st ~ok ~err = f p st ~ok ~err in - p +(*$inject + let aword = chars1_if (function 'a'..'z'|'A'..'Z'->true|_ -> false);; +*) +(*$= & ~printer:(errpp Q.Print.(list string)) +(Ok ["a";"b";"c"]) \ + (parse_string (optional (char '/') *> sep ~by:(char '/') aword) "/a/b/c") +(Ok ["a";"b";"c"]) \ + (parse_string (optional (char '/') *> sep ~by:(char '/') aword) "a/b/c") +*) -let memo (type a) (p:a t):a t = - let id = !MemoTbl.id_ in - incr MemoTbl.id_; +let sep1 ~by p = + p >>= fun x -> + sep ~by p >|= fun tl -> + x :: tl + +let line : _ t = { + run=fun st ~ok ~err -> + if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) + else ( + match String.index_from st.str st.i '\n' with + | j -> + let s = String.sub st.str st.i (j - st.i) in + ok {st with i=j+1} s + | exception Not_found -> + err (mk_error_ st (const_str_ "unterminated line")) + ) +} + +(*$= + (Ok "1234") (parse_string line "1234\nyolo") + (Ok ("1234", "yolo")) (parse_string (line ||| line) "1234\nyolo\nswag") +*) + +(* parse a string [s] using [p_sub], then parse [s] using [p]. + The result is that of parsing [s] using [p], but the state is + the one after using [p_sub], and errors are translated back into the context + of [p_sub]. + This can be useful for example in [p_sub line some_line_parser]. *) +let parse_sub_ p_sub p : _ t = { + run=fun st0 ~ok ~err -> + let p = p <* eoi in (* make sure [p] reads all *) + p_sub.run st0 + ~ok:(fun st1 s -> + p.run (state_of_string s) + ~ok:(fun _ r -> ok st1 r) + ~err:(fun e -> + err {e with Error.str=st0.str; offset=e.Error.offset + st0.i})) + ~err +} + +let each_line p : _ t = + fix + (fun self -> + try_or eoi + ~f:(fun _ -> pure []) + (parse_sub_ line p >>= fun x -> + self >|= fun tl -> x :: tl)) + +(*$= & ~printer:(errpp Q.Print.(list @@ list int)) + (Ok ([[1;1];[2;2];[3;3]])) \ + (parse_string (each_line (sep ~by:skip_space U.int)) "1 1\n2 2\n3 3\n") +*) + +let memo (type a) (p:a t) : a t = + let id = !Memo_state.id_ in + incr Memo_state.id_; let r = ref None in (* used for universal encoding *) - fun st ~ok ~err -> - let i = st.i in - let (lazy tbl) = st.memo in - try - let f = MemoTbl.H.find tbl (i,id) in - (* extract hidden value *) + + {run=fun st ~ok ~err -> + let tbl = match !(st.memo) with + | Some t -> t + | None -> + let tbl = Memo_tbl.create 32 in + st.memo := Some tbl; + tbl + in + + match r := None; - f (); - begin match !r with - | None -> assert false - | Some (MemoTbl.Ok x) -> ok x - | Some (MemoTbl.Fail e) -> err e - end - with Not_found -> - (* parse, and save *) - p st - ~ok:(fun x -> - MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Ok x)); - ok x) - ~err:(fun e -> - MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Fail e)); - err e) + let f = Memo_tbl.find tbl (st.i, id) in + f(); + !r + with + | None -> assert false + | Some (Ok (st,x)) -> ok st x + | Some (Error e) -> err e + | exception Not_found -> + (* parse, and save *) + p.run st + ~ok:(fun st' x -> + Memo_tbl.replace tbl (st.i,id) (fun () -> r := Some (Ok (st',x))); + ok st' x) + ~err:(fun e -> + Memo_tbl.replace tbl (st.i,id) (fun () -> r := Some (Error e)); + err e) + } let fix_memo f = - let rec p = - let p' = lazy (memo p) in - fun st ~ok ~err -> f (Lazy.force p') st ~ok ~err - in + let rec p = { + run=fun st ~ok ~err -> (Lazy.force p').run st ~ok ~err + } + and p' = lazy (memo (f p)) in p -let get_lnum = fun st ~ok ~err:_ -> ok st.lnum -let get_cnum = fun st ~ok ~err:_ -> ok st.cnum -let get_pos = fun st ~ok ~err:_ -> ok (st.lnum, st.cnum) +exception ParseError of Error.t -let parse_exn p st = - let res = ref None in - p st ~ok:(fun x -> res := Some x) ~err:(fun e -> raise e); - match !res with - | None -> assert false - | Some x -> x +let stringify_result = function + | Ok _ as x -> x + | Error e -> Error (Error.to_string e) -let exn_to_err e = Error (Printexc.to_string e) +let parse_string_exn p s = + p.run (state_of_string s) + ~ok:(fun _st x -> x) + ~err:(fun e -> raise (ParseError e)) -let parse p st = - try Ok (parse_exn p st) - with e -> exn_to_err e +let parse_string_e p s = + p.run (state_of_string s) + ~ok:(fun _st x -> Ok x) + ~err:(fun e -> Error e) -let parse_string_exn p s = parse_exn p (state_of_string s) - -let parse_string p s = parse p (state_of_string s) +let parse_string p s = parse_string_e p s |> stringify_result let read_all_ ic = let buf = Buffer.create 1024 in @@ -487,20 +697,19 @@ let read_all_ ic = end; Buffer.contents buf -let parse_file_exn p file = +let parse_file_e p file = let ic = open_in file in - let st = state_of_string (read_all_ ic) in - try - let res = parse_exn p st in - close_in ic; - res - with e -> - close_in_noerr ic; - raise e + let s = read_all_ ic in + let r = parse_string_e p s in + close_in ic; + r -let parse_file p file = - try Ok (parse_file_exn p file) - with e -> exn_to_err e +let parse_file p file = parse_file_e p file |> stringify_result + +let parse_file_exn p file = + match parse_file_e p file with + | Ok x -> x + | Error e -> raise (ParseError e) module Infix = struct let (>|=) = (>|=) @@ -509,6 +718,7 @@ module Infix = struct let (<* ) = (<* ) let ( *>) = ( *>) let (<|>) = (<|>) + let (|||) = (|||) let () = () end @@ -517,19 +727,19 @@ module U = struct let list ?(start="[") ?(stop="]") ?(sep=";") p = string start *> skip_white *> - sep_ ~by:(skip_white *> string sep *> skip_white) p <* - skip_white <* string stop - + sep_until + ~until:(skip_white <* string stop) + ~by:(skip_white *> string sep *> skip_white) p let int = skip_white *> - chars1_if (fun c -> is_num c || char_equal c '-') + chars1_if ~descr:"integer" (fun c -> is_num c || char_equal c '-') >>= fun s -> try return (int_of_string s) with Failure _ -> fail "expected an int" let hexa_int = - (try_ (string "0x") <|> return "") *> + (exact "0x" <|> return "") *> begin chars1_if (function '0' .. '9' | 'a'..'f' | 'A'..'F' -> true | _ -> false) >|= fun s -> @@ -583,5 +793,5 @@ end include CCShimsMkLet_.Make(struct type nonrec 'a t = 'a t include Infix - let monoid_product a1 a2 = pure (fun x y ->x,y) <*> a1 <*> a2 + let monoid_product = (|||) end) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 599c066c..1a2d5aa3 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -51,25 +51,34 @@ *) -type 'a or_error = ('a, string) result +(** {2 Errors} + @since NEXT_RELEASE *) +module Error : sig + type t + (** A parse error. + @since NEXT_RELEASE *) -type line_num = int -type col_num = int + val line_and_column : t -> int * int + (** Line and column numbers of the error position. *) -type parse_branch + val msg : t -> string -val string_of_branch : parse_branch -> string + val to_string : t -> string + (** Prints the error *) -exception ParseError of parse_branch * (unit -> string) -(** parsing branch * message. *) + val pp : Format.formatter -> t -> unit + (** Pretty prints the error *) +end + +type 'a or_error = ('a, Error.t) result +(* TODO: use [('a, error) result] instead, with easy conversion to [('a, string) result] *) + +exception ParseError of Error.t (** {2 Input} *) type position - -type state - -val state_of_string : string -> state +(* TODO: make a module Position: sig type t val line : t -> int val col : t -> int *) (** {2 Combinators} *) @@ -111,6 +120,10 @@ val ( *>) : _ t -> 'a t -> 'a t (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The results of [a] is ignored. *) +val (|||) : 'a t -> 'b t -> ('a * 'b) t +(** [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + val fail : string -> 'a t (** [fail msg] fails with the given message. It can trigger a backtrack. *) @@ -127,17 +140,28 @@ val eoi : unit t val nop : unit t (** Succeed with [()]. *) +val any_char : char t +(** [any_char] parses any character. + It still fails if the end of input was reached. + @since NEXT_RELEASE *) + +val any_chars : int -> string t +(** [any_chars len] parses exactly [len] characters from the input. + @since NEXT_RELEASE *) + val char : char -> char t (** [char c] parses the character [c] and nothing else. *) -val char_if : (char -> bool) -> char t -(** [char_if f] parses a character [c] if [f c = true]. *) +val char_if : ?descr:string -> (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true]. + @param descr describes what kind of character was expected *) val chars_if : (char -> bool) -> string t (** [chars_if f] parses a string of chars that satisfy [f]. *) -val chars1_if : (char -> bool) -> string t -(** Like {!chars_if}, but only non-empty strings. *) +val chars1_if : ?descr:string -> (char -> bool) -> string t +(** Like {!chars_if}, but only non-empty strings. + @param descr describes what kind of character was expected *) val endline : char t (** Parse '\n'. *) @@ -180,14 +204,10 @@ val (<|>) : 'a t -> 'a t -> 'a t to avoid wrapping large parsers with {!try_}). *) val () : 'a t -> string -> 'a t -(** [a msg] behaves like [a], but if [a] fails without - consuming any input, it fails with [msg] - instead. Useful as the last choice in a series of [<|>]: - [a <|> b <|> c "expected a|b|c"]. *) - -val try_ : 'a t -> 'a t -(** [try_ p] tries to parse like [p], but backtracks if [p] fails. - Useful in combination with [<|>]. *) +(** [a msg] behaves like [a], but if [a] fails, + [a ]. For example: + [a <|> b <|> c "expected one of a, b, c"]. *) val suspend : (unit -> 'a t) -> 'a t (** [suspend f] is the same as [f ()], but evaluates [f ()] only @@ -196,11 +216,46 @@ val suspend : (unit -> 'a t) -> 'a t val string : string -> string t (** [string s] parses exactly the string [s], and nothing else. *) +val exact : string -> string t +(** Alias to {!string}. + @since NEXT_RELEASE *) + val many : 'a t -> 'a list t -(** [many p] parses a list of [p], eagerly (as long as possible). *) +(** [many p] parses [p] repeatedly, until [p] fails, and + collects the results into a list. *) + +val optional : _ t -> unit t +(** [optional p] tries to parse [p], and return [()] whether it + succeeded or failed. Cannot fail. + @since NEXT_RELEASE *) + +val try_ : 'a t -> 'a option t +(** [try_ p] tries to parse using [p], and return [Some x] if [p] + succeeded with [x]. Otherwise it returns [None]. This cannot fail. + @since NEXT_RELEASE *) + +val many_until : until:_ t -> 'a t -> 'a list t +(** [many_until ~until p] parses as many [p] as it can until + the [until] parser successfully returns. + If [p] fails before that then [many_until ~until p] fails as well. + Typically [until] can be a closing ')' or another termination condition. + @since NEXT_RELEASE *) + +val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t +(** [try_or p1 ~f p2] attempts to parse [x] using [p1], + and then becomes [f x]. + If [p1] fails, then it becomes [p2]. + @since NEXT_RELEASE +*) + +val or_ : 'a t -> 'a t -> 'a t +(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] + from the same position. + @since NEXT_RELEASE *) val many1 : 'a t -> 'a list t -(** Parse a non-empty list. *) +(** [many1 p] is like [many p] excepts it fails if the + list is empty (i.e. it needs [p] to succeed at least once). *) val skip : _ t -> unit t (** [skip p] parses zero or more times [p] and ignores its result. *) @@ -208,9 +263,23 @@ val skip : _ t -> unit t val sep : by:_ t -> 'a t -> 'a list t (** [sep ~by p] parses a list of [p] separated by [by]. *) +(* TODO: lookahead? *) + +val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t +(** Same as {!sep} but stop when [until] parses successfully. + @since NEXT_RELEASE *) + val sep1 : by:_ t -> 'a t -> 'a list t (** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) +val line : string t +(** Parse a line, '\n' excluded. + @since NEXT_RELEASE *) + +val each_line : 'a t -> 'a list t +(** [each_line p] runs [p] on each line of the input. + @since NEXT_RELEASE *) + val fix : ('a t -> 'a t) -> 'a t (** Fixpoint combinator. *) @@ -227,50 +296,45 @@ val memo : 'a t -> 'a t val fix_memo : ('a t -> 'a t) -> 'a t (** Like {!fix}, but the fixpoint is memoized. *) -val get_lnum : int t -(** Reflect the current line number. *) - -val get_cnum : int t -(** Reflect the current column number. *) - -val get_pos : (int * int) t -(** Reflect the current (line, column) numbers. *) - (** {2 Parse} Those functions have a label [~p] on the parser, since 0.14. *) -val parse : 'a t -> state -> 'a or_error -(** [parse p st] applies [p] on the input, and returns [Ok x] if - [p] succeeds with [x], or [Error s] otherwise. *) +val stringify_result : 'a or_error -> ('a, string) result +(** Turn a {!Error.t}-oriented result into a more basic string result. + @since NEXT_RELEASE *) -val parse_exn : 'a t -> state -> 'a -(** Unsafe version of {!parse}. - @raise ParseError if it fails. *) +val parse_string : 'a t -> string -> ('a, string) result +(** Parse a string using the parser. *) -val parse_string : 'a t -> string -> 'a or_error -(** Specialization of {!parse} for string inputs. *) +val parse_string_e : 'a t -> string -> 'a or_error +(** Version of {!parse_string} that returns a more detailed error. *) val parse_string_exn : 'a t -> string -> 'a (** @raise ParseError if it fails. *) -val parse_file : 'a t -> string -> 'a or_error -(** [parse_file p file] parses [file] with [p] by opening the file - and reading it whole. *) +val parse_file : 'a t -> string -> ('a, string) result +(** [parse_file p filename] parses file named [filename] with [p] + by opening the file and reading it whole. *) + +val parse_file_e : 'a t -> string -> 'a or_error +(** Version of {!parse_file} that returns a more detailed error. *) val parse_file_exn : 'a t -> string -> 'a -(** @raise ParseError if it fails. *) +(** Same as {!parse_file}, but + @raise ParseError if it fails. *) (** {2 Infix} *) module Infix : sig val (>|=) : 'a t -> ('a -> 'b) -> 'b t - (** Map. *) + (** Map. [p >|= f] parses an item [x] using [p], + and returns [f x]. *) val (>>=) : 'a t -> ('a -> 'b t) -> 'b t - (** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, + (** Monadic bind. + [p >>= f] results in a new parser which behaves as [p] then, in case of success, applies [f] to the result. *) val (<*>) : ('a -> 'b) t -> 'a t -> 'b t @@ -282,20 +346,21 @@ module Infix : sig val ( *>) : _ t -> 'a t -> 'a t (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - results of [a] is ignored. *) + result of [a] is ignored. *) val (<|>) : 'a t -> 'a t -> 'a t - (** [a <|> b] tries to parse [a], and if [a] fails without - consuming any input, backtracks and tries - to parse [b], otherwise it fails as [a]. - See {!try_} to ensure [a] does not consume anything (but it is best - to avoid wrapping large parsers with {!try_}). *) + (** [a <|> b] tries to parse [a], and if [a] fails, it backtracks and tries + to parse [b]. + Alias to {!or_} *) + + val (|||) : 'a t -> 'b t -> ('a * 'b) t + (** [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) val () : 'a t -> string -> 'a t - (** [a msg] behaves like [a], but if [a] fails without - consuming any input, it fails with [msg] - instead. Useful as the last choice in a series of [<|>]: - [a <|> b <|> c "expected a|b|c"]. *) + (** [a msg] behaves like [a], but if [a] fails, + it fails with [msg] instead. Useful as the last choice in a series of + [<|>]: [a <|> b <|> c "expected a|b|c"]. *) end @@ -306,9 +371,12 @@ end module U : sig val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t (** [list p] parses a list of [p], with the OCaml conventions for - start token "\[", stop token "\]" and separator ";". + start token "[", stop token "]" and separator ";". Whitespace between items are skipped. *) + (* TODO: parse option? *) + (* TODO: split on whitespace? *) + val int : int t (** Parse an int in decimal representation. *) @@ -320,14 +388,17 @@ module U : sig val word : string t (** Non empty string of alpha num, start with alpha. *) + (* TODO: boolean literal *) + (* TODO: quoted string *) + val pair : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> ('a * 'b) t - (** Parse a pair using OCaml whitespace conventions. + (** Parse a pair using OCaml syntactic conventions. The default is "(a, b)". *) val triple : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t - (** Parse a triple using OCaml whitespace conventions. + (** Parse a triple using OCaml syntactic conventions. The default is "(a, b, c)". *) end