From 8866f032fe3a4185be57a905420961a722da9a4c Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Thu, 6 Oct 2022 22:03:26 -0400 Subject: [PATCH] feat(parser): move to menhir infix operators need parentheses when nested, there is no precedence yet --- src/parser/ast_term.ml | 111 +++- src/parser/dune | 12 +- src/parser/lex.mll | 50 ++ src/parser/loc.ml | 13 + src/parser/parse.mly | 172 ++++++ src/parser/parse_term.ml | 112 ---- src/parser/parse_term.mli | 9 - src/parser/parse_util.ml | 44 ++ src/parser/parser_comb.ml | 1009 --------------------------------- src/parser/parser_comb.mli | 695 ----------------------- src/parser/position.ml | 26 + src/parser/sidekick_parser.ml | 7 +- 12 files changed, 406 insertions(+), 1854 deletions(-) create mode 100644 src/parser/lex.mll create mode 100644 src/parser/loc.ml create mode 100644 src/parser/parse.mly delete mode 100644 src/parser/parse_term.ml delete mode 100644 src/parser/parse_term.mli create mode 100644 src/parser/parse_util.ml delete mode 100644 src/parser/parser_comb.ml delete mode 100644 src/parser/parser_comb.mli create mode 100644 src/parser/position.ml diff --git a/src/parser/ast_term.ml b/src/parser/ast_term.ml index bfb7a1c8..615fc572 100644 --- a/src/parser/ast_term.ml +++ b/src/parser/ast_term.ml @@ -1,34 +1,54 @@ -module Pos = Parser_comb.Position +type position = Position.t +type loc = Loc.t = { start: position; end_: position } -type position = Pos.t -type loc = { start: position; end_: position } +open struct + let mk_loc = Loc.mk +end -let mk_loc ~start ~end_ : loc = { start; end_ } +type 'a with_loc = { view: 'a; loc: loc } -let pp_loc out (loc : loc) = - Fmt.fprintf out "%a - %a" Pos.pp loc.start Pos.pp loc.end_ - -let loc_merge a b : loc = - { start = Pos.min a.start b.start; end_ = Pos.max a.end_ b.end_ } - -type term = { view: term_view; loc: loc } +type term = term_view with_loc (** Expressions *) and term_view = | Var of string | Int of string | App of term * term list - | Let of (var * term) list * term + | Let of let_binding list * term | Lambda of var list * term | Pi of var list * term | Arrow of term list * term | Error_node of string +and let_binding = var * term + and var = { name: string; ty: term option } (** Variable *) +type decl = decl_view with_loc +(** Declarations *) + +(* TODO: axiom *) +and decl_view = + | D_def of { name: string; args: var list; ty_ret: term option; rhs: term } + | D_hash of string * term + | D_theorem of { name: string; goal: term; proof: proof } + +and proof = proof_view with_loc + +and proof_view = + | P_by of term + | P_exact of term + | P_steps of { + steps: proof_step list; (** intermediate steps *) + ret: proof; (** proof for the result *) + } + +and proof_step = proof_step_view with_loc +and proof_step_view = { name: string; goal: term; proof: proof } + open struct - let mk_ ~loc view : term = { loc; view } + let mk_ ~loc view : _ with_loc = { loc; view } end let view (t : term) = t.view @@ -40,7 +60,7 @@ let mk_app f args : term = if args = [] then f else ( - let loc = List.fold_left (fun loc a -> loc_merge loc a.loc) f.loc args in + let loc = List.fold_left (fun loc a -> Loc.merge loc a.loc) f.loc args in mk_ ~loc (App (f, args)) ) @@ -68,12 +88,25 @@ let mk_let ~loc bs bod : term = if bs = [] then bod else ( - let loc = loc_merge loc bod.loc in + let loc = Loc.merge loc bod.loc in mk_ ~loc (Let (bs, bod)) ) let mk_error ~loc msg : term = mk_ ~loc (Error_node msg) +let decl_def ~loc ?ty_ret name args rhs : decl = + mk_ ~loc @@ D_def { name; args; ty_ret; rhs } + +let decl_hash ~loc s t : decl = mk_ ~loc @@ D_hash (s, t) + +let decl_theorem ~loc name goal proof : decl = + mk_ ~loc @@ D_theorem { name; goal; proof } + +let proof_by ~loc t : proof = mk_ ~loc @@ P_by t +let proof_exact ~loc t : proof = mk_ ~loc @@ P_exact t +let proof_steps ~loc steps ret : proof = mk_ ~loc @@ P_steps { steps; ret } +let step ~loc name goal proof : proof_step = mk_ ~loc @@ { name; goal; proof } + (** Pretty print terms *) let rec pp_term out (e : term) : unit = let pp = pp_term in @@ -84,12 +117,6 @@ let rec pp_term out (e : term) : unit = | Var _ | Error_node _ | Int _ -> pp out e in - let pp_tyvar out x = - match x.ty with - | None -> Fmt.string out x.name - | Some ty -> Fmt.fprintf out "(@[%s : %a@])" x.name pp ty - in - match e.view with | Var v -> Fmt.string out v | Error_node msg -> Fmt.fprintf out "" msg @@ -101,11 +128,47 @@ let rec pp_term out (e : term) : unit = (Util.pp_list ~sep:" -> " pp_sub) args pp_sub ret | Let (bs, bod) -> - let ppb out (x, t) = Fmt.fprintf out "@[<2>%s :=@ %a@]" x.name pp t in - Fmt.fprintf out "@[@[<2>let@ %a@] in@ %a@]" - (Util.pp_list ~sep:"and" ppb) + let ppb out ((x, t) : let_binding) = + Fmt.fprintf out "@[<2>%s :=@ %a@]" x.name pp t + in + Fmt.fprintf out "@[@[<2>let@ @[%a@]@] in@ %a@]" + (Util.pp_list ~sep:" and " ppb) bs pp bod | Lambda (args, bod) -> Fmt.fprintf out "@[lam %a.@ %a@]" (Util.pp_list pp_tyvar) args pp_sub bod | Pi (args, bod) -> Fmt.fprintf out "@[pi %a.@ %a@]" (Util.pp_list pp_tyvar) args pp_sub bod + +and pp_tyvar out (x : var) : unit = + match x.ty with + | None -> Fmt.string out x.name + | Some ty -> Fmt.fprintf out "(@[%s : %a@])" x.name pp_term ty + +let rec pp_proof out (p : proof) : unit = + match p.view with + | P_by t -> Fmt.fprintf out "@[by@ %a@]" pp_term t + | P_exact t -> Fmt.fprintf out "@[exact@ %a@]" pp_term t + | P_steps { steps; ret } -> + Fmt.fprintf out "{@[%a;@ %a@]}" + (Util.pp_list ~sep:";" pp_proof_step) + steps pp_proof ret + +and pp_proof_step out (step : proof_step) : unit = + let s = step.view in + Fmt.fprintf out "@[<2>have %s := %a@ proof %a@]" s.name pp_term s.goal + pp_proof s.proof + +let pp_decl out (d : decl) = + match d.view with + | D_def { name; args; ty_ret; rhs } -> + let pp_tyret out () = + match ty_ret with + | None -> () + | Some ty -> Fmt.fprintf out " @[: %a@]" pp_term ty + in + Fmt.fprintf out "@[<2>def %s%a%a :=@ %a@];" name (Util.pp_list pp_tyvar) + args pp_tyret () pp_term rhs + | D_hash (name, t) -> Fmt.fprintf out "@[<2>#%s@ %a@];" name pp_term t + | D_theorem { name; goal; proof } -> + Fmt.fprintf out "@[theorem %s :=@ %a@ @[proof %a@]@];" name + pp_term goal pp_proof proof diff --git a/src/parser/dune b/src/parser/dune index 94a53ab7..a3284592 100644 --- a/src/parser/dune +++ b/src/parser/dune @@ -1,6 +1,12 @@ (library (name sidekick_parser) - (public_name sidekick.parser) - (synopsis "A term parser") - (libraries sidekick.core sidekick.util) + (public_name sidekick-parser) + (synopsis "High level syntax for sidekick") + (libraries sidekick.core sidekick.util pp_loc) (flags :standard -open Sidekick_util)) + +(ocamllex + (modules lex)) + +(menhir + (modules parse)) diff --git a/src/parser/lex.mll b/src/parser/lex.mll new file mode 100644 index 00000000..34994989 --- /dev/null +++ b/src/parser/lex.mll @@ -0,0 +1,50 @@ + + +{ + +open Parse + +} + +let num = ['0' - '9'] +let alpha = ['a' - 'z' 'A' - 'Z'] +let alphanum = alpha | num +let alphanum_under = alpha | num | ['_'] +let whitespace = [' ' '\t' '\n'] +let sym = '_' | '=' | '+' | '-' | '*' | '^' | '%' | '$' | '#' | '@' | '!' | '>' | '<' | '[' | ']' + +let id = (alpha | '_') alphanum_under* +let int = '-'? num+ +let symbol = sym+ +let hash = '#' alphanum_under+ + +rule token = parse +| whitespace { token lexbuf } +| ";" { SEMICOLON } +| ":" { COLON } +| "(" { LPAREN } +| ")" { RPAREN } +| ":=" { EQDEF } +| "->" { ARROW } +| "." { DOT } +| "pi" { PI } +| "fn" { FUNCTION } +| "let" { LET } +| "in" { IN } +| "and" { AND } +| "have" { HAVE } +| "theorem" { THEOREM } +| "by" { BY } +| "exact" { EXACT } +| "have" { HAVE } +| int { INT(Lexing.lexeme lexbuf) } +| id { ID (Lexing.lexeme lexbuf) } +| symbol { SYMBOL (Lexing.lexeme lexbuf) } +| hash { HASH (Lexing.lexeme lexbuf) } +| eof { EOF } +| _ as c { ERROR(String.make 1 c) } + + + + + diff --git a/src/parser/loc.ml b/src/parser/loc.ml new file mode 100644 index 00000000..f748500c --- /dev/null +++ b/src/parser/loc.ml @@ -0,0 +1,13 @@ +type position = Position.t +type t = { start: position; end_: position } + +let mk ~start ~end_ : t = { start; end_ } + +let pp out (self : t) = + Fmt.fprintf out "%a - %a" Position.pp self.start Position.pp self.end_ + +let merge a b : t = + { start = Position.min a.start b.start; end_ = Position.max a.end_ b.end_ } + +let of_lexloc ((p1, p2) : Lexing.position * Lexing.position) : t = + { start = Position.of_lexpos p1; end_ = Position.of_lexpos p2 } diff --git a/src/parser/parse.mly b/src/parser/parse.mly new file mode 100644 index 00000000..8b7c382c --- /dev/null +++ b/src/parser/parse.mly @@ -0,0 +1,172 @@ + +%{ + +open struct + module A = Ast_term +end + +%} + +%token ID +%token INT +%token SYMBOL +%token HASH + +%token ERROR + +%token LET +%token IN +%token AND +%token EQDEF +%token COLON +%token FUNCTION +%token PI +%token ARROW +%token DOT + +%token SEMICOLON + +%token LPAREN +%token RPAREN +/* TODO: implicit arguments in def */ +%token LBRACE +%token RBRACE + +%token DEF +%token THEOREM +%token BY +%token EXACT +%token HAVE + +%token EOF + +%start top_term +%start top_decl +%start top_decls + +%% + +top_decls: d=decl* EOF { d } +top_decl: d=decl EOF { d } +top_term: t=term EOF { t } + +decl: +| h=HASH t=term SEMICOLON { + let loc = Loc.of_lexloc $loc in + A.decl_hash ~loc h t +} +| DEF name=name args=tyvars* ty_ret=optional_ty EQDEF rhs=term SEMICOLON { + let loc = Loc.of_lexloc $loc in + let args = List.flatten args in + A.decl_def ~loc name args ?ty_ret rhs +} +| THEOREM name=name EQDEF goal=term proof=proof SEMICOLON { + let loc = Loc.of_lexloc $loc in + A.decl_theorem ~loc name goal proof +} + +proof: +| BY t=term { + let loc = Loc.of_lexloc $loc in + A.proof_by ~loc t +} +| EXACT t=term { + let loc = Loc.of_lexloc $loc in + A.proof_exact ~loc t +} +| LBRACE steps=proof_step+ ret=proof RBRACE { + let loc = Loc.of_lexloc $loc in + A.proof_steps ~loc steps ret +} + +proof_step: +| HAVE name=name EQDEF goal=term proof=proof SEMICOLON { + let loc = Loc.of_lexloc $loc in + A.step ~loc name goal proof +} + +tyvar: +| name=name ty=optional_ty { A.var ?ty name } + +tyvars: +| name=name { [A.var name] } +| LPAREN names=name+ COLON ty=term RPAREN { + List.map (fun name -> A.var ~ty name) names +} + +%inline optional_ty: +| { None } +| COLON t=term { Some t } + +term: +| t=binder_term { t } +| LET bs=let_bindings IN rhs=term { + let loc = Loc.of_lexloc $loc in + A.mk_let ~loc bs rhs +} + +let_binding: +| x=tyvar EQDEF t=term {x,t} + +let_bindings: +| b=let_binding { [b] } +| b=let_binding AND l=let_bindings { b::l } + +binder_term: +| t=sym_term { t } +| FUNCTION vars=tyvars+ DOT rhs=binder_term { + let loc = Loc.of_lexloc $loc in + let vars = List.flatten vars in + A.mk_lam ~loc vars rhs +} +| PI vars=tyvars+ DOT rhs=binder_term { + let loc = Loc.of_lexloc $loc in + let vars = List.flatten vars in + A.mk_pi ~loc vars rhs +} + +sym_term: +| t=arrow_term { t } +| t=arrow_term sym=SYMBOL u=arrow_term { + let locsym = Loc.of_lexloc $loc(sym) in + A.mk_app (A.mk_var ~loc:locsym sym) [t;u] +} + +arrow_term: +| t=apply_term { t } +| t=apply_term ARROW u=arrow_term { + let loc = Loc.of_lexloc $loc in + A.mk_arrow ~loc [t] u +} + +apply_term: +| t=atomic_term { t } +| f=atomic_term args=atomic_term+ { + A.mk_app f args +} + +(* TODO: lambda, pi, arrow *) + + +atomic_term: +| v=name { + let loc = Loc.of_lexloc $loc in + A.mk_var ~loc v +} +| i=INT { + let loc = Loc.of_lexloc $loc in + A.mk_int ~loc i +} +| LPAREN t=term RPAREN { t } +| err=ERROR { + let loc = Loc.of_lexloc $loc in + A.mk_error ~loc err +} + +name: +| x=ID { x } + +%% + + + diff --git a/src/parser/parse_term.ml b/src/parser/parse_term.ml deleted file mode 100644 index 5cc6b9cf..00000000 --- a/src/parser/parse_term.ml +++ /dev/null @@ -1,112 +0,0 @@ -module A = Ast_term -module P = Parser_comb -open P.Infix - -let is_alpha = function - | 'a' .. 'z' | 'A' .. 'Z' -> true - | _ -> false - -let is_num = function - | '0' .. '9' -> true - | _ -> false - -let is_alphanum = function - | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true - | _ -> false - -let id : string P.t = - P.chars_fold_transduce `Start ~f:(fun st c -> - match st, c with - | `Start, c when is_alpha c -> `Yield (`Inside, c) - | `Inside, c when is_alphanum c -> `Yield (`Inside, c) - | `Start, _ -> `Fail "expected identifier" - | `Inside, _ -> `Stop) - >|= snd - -let int : string P.t = - P.chars_fold_transduce `Start ~f:(fun st c -> - match st, c with - | `Start, '-' -> `Yield (`Post_neg, c) - | (`Start | `Post_neg | `Inside), c when is_num c -> `Yield (`Inside, c) - | `Start, _ -> `Fail "expected integer" - | `Post_neg, _ -> `Fail "expected a number after '-'" - | `Inside, _ -> `Stop) - >|= snd - -let with_loc (p : 'a P.t) : ('a * A.loc) P.t = - let+ start = P.pos and+ x = p and+ end_ = P.pos in - let loc = A.mk_loc ~start ~end_ in - x, loc - -(* TODO: also skip comments *) -let skip_white : unit P.t = P.skip_white - -let p_var : A.term P.t = - let+ name, loc = with_loc id in - A.mk_var ~loc name - -let p_int : A.term P.t = - let+ x, loc = with_loc int in - A.mk_int ~loc x - -(* main parser *) -let rec p_term () : A.term P.t = - P.suspend @@ fun () -> - P.skip_white - *> (P.try_or_l ~msg:"expected term" - @@ List.flatten - [ - [ - ( P.lookahead_ignore (P.guard (String.equal "let") id), - let+ _id_let, loc = with_loc id - and+ x = skip_white *> id - and+ _ = skip_white *> P.string ":=" - and+ t = p_term () - and+ _ = skip_white *> P.string "in" - and+ bod = p_term () in - assert (_id_let = "let"); - (* TODO: allow [let x : _ := t in bod] *) - let x = A.var x in - A.mk_let ~loc [ x, t ] bod ); - ]; - p_term_atomic_cases () ~f:(fun t -> p_term_apply t []); - ]) - -and p_term_atomic_cases ~f () : _ list = - [ - P.lookahead_ignore id, p_var >>= f; - P.lookahead_ignore int, p_int >>= f; - ( P.lookahead_ignore (P.char '('), - P.char '(' *> p_term () <* skip_white *> P.char ')' >>= f ); - ] - -and p_term_atomic ?else_ ~f () = - P.suspend @@ fun () -> - P.try_or_l ?else_ ~msg:"expected atomic term" @@ p_term_atomic_cases ~f () - -(* TODO: handle infix symbols, with a table (sym -> precedence * parser). - Start by registering "=" and "->" in there. *) -(* TODO: handle lambda and pi *) - -(* parse application of [t] to 0 or more arguments *) -and p_term_apply t args : A.term P.t = - P.suspend @@ fun () -> - let ret = P.suspend @@ fun () -> P.return @@ A.mk_app t (List.rev args) in - skip_white - *> (P.try_or_l ~else_:ret - @@ List.flatten - [ - [ - P.eoi, ret; - ( P.lookahead_ignore (P.guard (fun s -> s = "let" || s = "in") id), - (* if we meet some keyword, we stop *) - ret ); - ]; - p_term_atomic_cases () ~f:(fun a -> p_term_apply t (a :: args)); - ]) - -let p = p_term () -let of_string s = P.parse_string_e p s -let of_string_exn s = P.parse_string_exn p s -let of_string_l s = P.parse_string_e (P.many p) s -let of_string_l_exn s = P.parse_string_exn (P.many p) s diff --git a/src/parser/parse_term.mli b/src/parser/parse_term.mli deleted file mode 100644 index 569ceb59..00000000 --- a/src/parser/parse_term.mli +++ /dev/null @@ -1,9 +0,0 @@ -module A = Ast_term - -val p : A.term Parser_comb.t -(** Term parser *) - -val of_string : string -> A.term Parser_comb.or_error -val of_string_exn : string -> A.term -val of_string_l : string -> A.term list Parser_comb.or_error -val of_string_l_exn : string -> A.term list diff --git a/src/parser/parse_util.ml b/src/parser/parse_util.ml new file mode 100644 index 00000000..23b1a7a7 --- /dev/null +++ b/src/parser/parse_util.ml @@ -0,0 +1,44 @@ +module Error = struct + type t = { loc: Loc.t; msg: string } + + (* TODO: use pp_loc *) + + let mk ~loc msg : t = { msg; loc } + + let pp out (self : t) = + Fmt.fprintf out "parse error: %s@ %a" self.msg Loc.pp self.loc + + let to_string = Fmt.to_string pp +end + +exception Exn_parse_error of Error.t + +open struct + module A = Ast_term + + let guard buf f = + try f () + with Parse.Error -> + let loc = + Loc.of_lexloc (Lexing.lexeme_start_p buf, Lexing.lexeme_end_p buf) + in + raise (Exn_parse_error (Error.mk ~loc @@ "syntax error")) +end + +let term_of_string_exn ?filename (s : string) : A.term = + let buf = Lexing.from_string s in + let@ () = guard buf in + Option.iter (Lexing.set_filename buf) filename; + Parse.top_term Lex.token buf + +let term_of_string ?filename (s : string) : _ result = + try Ok (term_of_string_exn ?filename s) with Exn_parse_error e -> Error e + +let decls_of_string_exn ?filename (s : string) : A.decl list = + let buf = Lexing.from_string s in + let@ () = guard buf in + Option.iter (Lexing.set_filename buf) filename; + Parse.top_decls Lex.token buf + +let decls_of_string ?filename (s : string) : _ result = + try Ok (decls_of_string_exn ?filename s) with Exn_parse_error e -> Error e diff --git a/src/parser/parser_comb.ml b/src/parser/parser_comb.ml deleted file mode 100644 index 515d6b12..00000000 --- a/src/parser/parser_comb.ml +++ /dev/null @@ -1,1009 +0,0 @@ -module Memo_tbl = Hashtbl.Make (struct - type t = int * int (* id of parser, position *) - - let equal ((a, b) : t) (c, d) = a = c && b = d - let hash = Hashtbl.hash -end) - -module Memo_state = struct - (* table of closures, used to implement universal type *) - type t = (unit -> unit) Memo_tbl.t - - (* unique ID for each parser *) - let id_ = ref 0 -end - -(* state common to all parser instances *) -type common_state = { - str: string; - mutable line_offsets: int array option; - mutable memo: Memo_state.t option; -} - -type position = { - pos_cs: common_state; - pos_offset: int; - mutable pos_lc: (int * int) option; -} - -module Position = struct - type t = position - - let compute_line_offsets_ (s : string) : int array = - let lines = Vec.create () in - let i = ref 0 in - Vec.push lines 0; - while !i < String.length s do - match String.index_from s !i '\n' with - | exception Not_found -> i := String.length s - | j -> - Vec.push lines j; - i := j + 1 - done; - Vec.to_array lines - - let line_offsets_ cs = - match cs.line_offsets with - | Some lines -> lines - | None -> - let lines = compute_line_offsets_ cs.str in - cs.line_offsets <- Some lines; - lines - - let int_cmp_ : int -> int -> int = compare - - (* TODO: use pos_cs.line_offsets *) - (* actually re-compute line and column from the buffer *) - let compute_line_and_col_ (cs : common_state) (off : int) : int * int = - let offsets = line_offsets_ cs in - assert (offsets.(0) = 0); - match CCArray.bsearch ~cmp:int_cmp_ off offsets with - | `At 0 -> 0, 0 - | `At n -> n - 1, off - offsets.(n - 1) - 1 - | `Just_after n -> n, off - offsets.(n) - | `Empty -> assert false - | `All_bigger -> assert false (* off >= 0, and offsets[0] == 0 *) - | `All_lower -> - let n = Array.length offsets - 1 in - n, off - offsets.(n) - - let line_and_column self = - match self.pos_lc with - | Some tup -> tup - | None -> - let tup = compute_line_and_col_ self.pos_cs self.pos_offset in - self.pos_lc <- Some tup; - (* save *) - tup - - let line self = fst (line_and_column self) - let column self = snd (line_and_column self) - - let min a b = - if a.pos_offset < b.pos_offset then - a - else - b - - let max a b = - if a.pos_offset > b.pos_offset then - a - else - b - - let pp out self = - let l, c = line_and_column self in - Format.fprintf out "at line %d, column %d" l c -end - -module Error = struct - type t = { msg: unit -> string; pos: position } - - let position self = self.pos - let line_and_column self = Position.line_and_column self.pos - let msg self = self.msg () - - let to_string self = - let line, col = line_and_column self in - Printf.sprintf "at line %d, char %d: %s" line col (self.msg ()) - - let pp out self = - let line, col = line_and_column self in - Format.fprintf out "@[at line %d, char %d:@ %s@]" line col (self.msg ()) -end - -type +'a or_error = ('a, Error.t) result - -type state = { - cs: common_state; - i: int; (* offset in [str] *) - j: int; (* end pointer in [str], excluded. [len = j-i] *) -} -(** Purely functional state passed around *) -(* FIXME: replace memo with: - [global : global_st ref] - - where: - [type global = { - mutable memo: Memo_state.t option; - line_offsets: int Vec.vector; - } - - with line_offsets used to cache the offset where each line begins, - and is computed lazily, to make {!Position.line_and_column} - faster if called many times. -*) - -let[@inline] char_equal (a : char) b = Stdlib.( = ) a b -let string_equal = String.equal -let[@inline] const_str_ x () : string = x - -let state_of_string str = - let s = - { - cs = { str; memo = None; line_offsets = None }; - i = 0; - j = String.length str; - } - in - s - -let[@inline] is_done st = st.i >= st.j -let[@inline] cur st = st.cs.str.[st.i] - -let pos_of_st_ st : position = - { pos_cs = st.cs; pos_offset = st.i; pos_lc = None } - -let mk_error_ st msg : Error.t = { Error.msg; pos = pos_of_st_ st } - -(* consume one char, passing it to [ok]. *) -let consume_ st ~ok ~err = - if is_done st then ( - let msg = const_str_ "unexpected end of input" in - err (mk_error_ st msg) - ) else ( - let c = st.cs.str.[st.i] in - ok { st with i = st.i + 1 } c - ) - -type 'a t = { - run: 'b. state -> ok:(state -> 'a -> 'b) -> err:(Error.t -> 'b) -> 'b; -} -[@@unboxed] -(** Takes the input and two continuations: - {ul - {- [ok] to call with the result and new state when it's done} - {- [err] to call when the parser met an error} - } -*) - -let return x : _ t = { run = (fun st ~ok ~err:_ -> ok st x) } -let pure = return - -let map f (p : 'a t) : _ t = - { run = (fun st ~ok ~err -> p.run st ~ok:(fun st x -> ok st (f x)) ~err) } - -let bind f (p : 'a t) : _ t = - { - run = - (fun st ~ok ~err -> - p.run st - ~ok:(fun st x -> - let p2 = f x in - p2.run st ~ok ~err) - ~err); - } - -let ap (f : _ t) (a : _ t) : _ t = - { - run = - (fun st ~ok ~err -> - f.run st - ~ok:(fun st f -> a.run st ~ok:(fun st x -> ok st (f x)) ~err) - ~err); - } - -let ap_left (a : _ t) (b : _ t) : _ t = - { - run = - (fun st ~ok ~err -> - a.run st ~ok:(fun st x -> b.run st ~ok:(fun st _ -> ok st x) ~err) ~err); - } - -let ap_right (a : _ t) (b : _ t) : _ t = - { - run = - (fun st ~ok ~err -> - a.run st ~ok:(fun st _ -> b.run st ~ok:(fun st x -> ok st x) ~err) ~err); - } - -let or_ (p1 : 'a t) (p2 : 'a t) : _ t = - { - run = (fun st ~ok ~err -> p1.run st ~ok ~err:(fun _e -> p2.run st ~ok ~err)); - } - -let both a b = - { - run = - (fun st ~ok ~err -> - a.run st - ~ok:(fun st xa -> b.run st ~ok:(fun st xb -> ok st (xa, xb)) ~err) - ~err); - } - -let set_error_message msg (p : 'a t) : _ t = - { - run = - (fun st ~ok ~err -> - p.run st ~ok ~err:(fun _e -> err (mk_error_ st (const_str_ msg)))); - } - -module Infix = struct - let[@inline] ( >|= ) p f = map f p - let[@inline] ( >>= ) p f = bind f p - let ( <*> ) = ap - let ( <* ) = ap_left - let ( *> ) = ap_right - let ( <|> ) = or_ - let ( ||| ) = both - let[@inline] ( ) p msg = set_error_message msg p - - [@@@ifge 4.8] - - let ( let+ ) = ( >|= ) - let ( let* ) = ( >>= ) - let ( and+ ) = both - let ( and* ) = ( and+ ) - - [@@@endif] -end - -include Infix - -let map2 f x y = pure f <*> x <*> y -let map3 f x y z = pure f <*> x <*> y <*> z - -let junk_ (st : state) : state = - assert (st.i < st.j); - { st with i = st.i + 1 } - -let eoi = - { - run = - (fun st ~ok ~err -> - if is_done st then - ok st () - else - err (mk_error_ st (const_str_ "expected end of input"))); - } - -let with_pos p : _ t = - { - run = - (fun st ~ok ~err -> - p.run st ~ok:(fun st' x -> ok st' (x, pos_of_st_ st)) ~err); - } - -let pos : _ t = { run = (fun st ~ok ~err:_ -> ok st (pos_of_st_ st)) } - -(* a slice is just a state, which makes {!recurse} quite easy. *) -type slice = state - -module Slice = struct - type t = slice - - let length sl = sl.j - sl.i - let is_empty sl = sl.i = sl.j - let to_string sl = String.sub sl.cs.str sl.i (length sl) -end - -let recurse slice p : _ t = - { - run = - (fun _st ~ok ~err -> - (* make sure these states are related. all slices share the - same reference as the initial state they derive from. *) - assert (CCShims_.Stdlib.(_st.cs == slice.cs)); - p.run slice ~ok ~err); - } - -let all = - { - run = - (fun st ~ok ~err:_ -> - if is_done st then - ok st st - else ( - let st_done = { st with i = st.j } in - ok st_done st - )); - } - -let all_str = all >|= Slice.to_string - -let fail msg : _ t = - { run = (fun st ~ok:_ ~err -> err (mk_error_ st (const_str_ msg))) } - -let failf msg = Printf.ksprintf fail msg -let fail_lazy msg = { run = (fun st ~ok:_ ~err -> err (mk_error_ st msg)) } - -let parsing what p = - { - run = - (fun st ~ok ~err -> - p.run st ~ok ~err:(fun e -> - let msg () = - Printf.sprintf "while parsing %s:\n%s" what (e.Error.msg ()) - in - err { e with Error.msg })); - } - -let empty = { run = (fun st ~ok ~err:_ -> ok st ()) } -let nop = empty -let any_char = { run = (fun st ~ok ~err -> consume_ st ~ok ~err) } - -let char c : _ t = - { - run = - (fun st ~ok ~err -> - consume_ st - ~ok:(fun st c2 -> - if char_equal c c2 then - ok st c - else ( - let msg () = Printf.sprintf "expected '%c', got '%c'" c c2 in - err (mk_error_ st msg) - )) - ~err); - } - -let char_if ?descr p = - { - run = - (fun st ~ok ~err -> - consume_ st - ~ok:(fun st c -> - if p c then - ok st c - else ( - let msg () = - let rest = - match descr with - | None -> "" - | Some d -> Printf.sprintf ", expected %s" d - in - Printf.sprintf "unexpected char '%c'%s" c rest - in - err (mk_error_ st msg) - )) - ~err); - } - -let take_if p : slice t = - { - run = - (fun st ~ok ~err:_ -> - let i = ref st.i in - while - let st = { st with i = !i } in - (not (is_done st)) && p (cur st) - do - incr i - done; - ok { st with i = !i } { st with j = !i }); - } - -let take1_if ?descr p = - take_if p >>= fun sl -> - if Slice.is_empty sl then ( - let msg () = - let what = - match descr with - | None -> "" - | Some d -> Printf.sprintf " for %s" d - in - Printf.sprintf "expected non-empty sequence of chars%s" what - in - fail_lazy msg - ) else - return sl - -let chars_if p = take_if p >|= Slice.to_string - -let chars1_if ?descr p = - { - run = - (fun st ~ok ~err -> - (chars_if p).run st - ~ok:(fun st s -> - if string_equal s "" then ( - let msg () = - let what = - match descr with - | None -> "" - | Some d -> Printf.sprintf " for %s" d - in - Printf.sprintf "expected non-empty sequence of chars%s" what - in - err (mk_error_ st msg) - ) else - ok st s) - ~err); - } - -let guard ?descr f p = - { - run = - (fun st ~ok ~err -> - p.run st - ~ok:(fun st x -> - if f x then - ok st x - else ( - let msg () = - match descr with - | None -> "guard failed" - | Some s -> Printf.sprintf "guard failed: %s" s - in - err (mk_error_ st msg) - )) - ~err); - } - -exception Fold_fail of state * string - -let chars_fold ~f acc0 = - { - run = - (fun st ~ok ~err -> - let i0 = st.i in - let i = ref i0 in - let acc = ref acc0 in - let continue = ref true in - try - while !continue do - let st = { st with i = !i } in - if is_done st then - continue := false - else ( - let c = cur st in - match f !acc c with - | `Continue acc' -> - incr i; - acc := acc' - | `Stop a -> - acc := a; - continue := false - | `Consume_and_stop a -> - acc := a; - incr i; - continue := false - | `Fail msg -> raise (Fold_fail (st, msg)) - ) - done; - ok { st with i = !i } (!acc, { st with j = !i }) - with Fold_fail (st, msg) -> err (mk_error_ st (const_str_ msg))); - } - -let chars_fold_transduce ~f acc0 = - { - run = - (fun st ~ok ~err -> - let i0 = st.i in - let i = ref i0 in - let acc = ref acc0 in - let continue = ref true in - let buf = Buffer.create 16 in - try - while !continue do - let st = { st with i = !i } in - if is_done st then - continue := false - else ( - let c = cur st in - match f !acc c with - | `Continue acc' -> - incr i; - acc := acc' - | `Yield (acc', c') -> - incr i; - acc := acc'; - Buffer.add_char buf c' - | `Stop -> continue := false - | `Consume_and_stop -> - incr i; - continue := false - | `Fail msg -> raise (Fold_fail (st, msg)) - ) - done; - ok { st with i = !i } (!acc, Buffer.contents buf) - with Fold_fail (st, msg) -> err (mk_error_ st (const_str_ msg))); - } - -let skip_chars p : _ t = - let rec self = - { - run = - (fun st ~ok ~err -> - if (not (is_done st)) && p (cur st) then ( - let st = junk_ st in - self.run st ~ok ~err - ) else - ok st ()); - } - in - self - -let is_alpha = function - | 'a' .. 'z' | 'A' .. 'Z' -> true - | _ -> false - -let is_num = function - | '0' .. '9' -> true - | _ -> false - -let is_alpha_num = function - | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true - | _ -> false - -let is_space = function - | ' ' | '\t' -> true - | _ -> false - -let is_white = function - | ' ' | '\t' | '\n' -> true - | _ -> false - -let space = char_if is_space -let white = char_if is_white - -let endline = - char_if ~descr:"end-of-line ('\\n')" (function - | '\n' -> true - | _ -> false) - -let skip_space = skip_chars is_space -let skip_white = skip_chars is_white - -let try_or p1 ~f ~else_:p2 = - { - run = - (fun st ~ok ~err -> - p1.run st - ~ok:(fun st x -> (f x).run st ~ok ~err) - ~err:(fun _ -> p2.run st ~ok ~err)); - } - -let try_or_l ?(msg = "try_or_l ran out of options") ?else_ l : _ t = - { - run = - (fun st ~ok ~err -> - let rec loop = function - | (test, p) :: tl -> - test.run st - ~ok:(fun _ _ -> p.run st ~ok ~err) (* commit *) - ~err:(fun _ -> loop tl) - | [] -> - (match else_ with - | None -> err (mk_error_ st (const_str_ msg)) - | Some p -> p.run st ~ok ~err) - in - loop l); - } - -let suspend f = - { - run = - (fun st ~ok ~err -> - let p = f () in - p.run st ~ok ~err); - } - -(* read [len] chars at once *) -let take len : slice t = - { - run = - (fun st ~ok ~err -> - if st.i + len <= st.j then ( - let slice = { st with j = st.i + len } in - let st = { st with i = st.i + len } in - ok st slice - ) else ( - let msg () = - Printf.sprintf "expected to be able to consume %d chars" len - in - err (mk_error_ st msg) - )); - } - -let any_char_n len : _ t = take len >|= Slice.to_string - -let exact s = - { - run = - (fun st ~ok ~err -> - (* parse a string of length [String.length s] and compare with [s] *) - (any_char_n (String.length s)).run st - ~ok:(fun st s2 -> - if string_equal s s2 then - ok st s - else ( - let msg () = Printf.sprintf "expected %S, got %S" s s2 in - err (mk_error_ st msg) - )) - ~err); - } - -let string = exact - -let fix f = - let rec self = - { run = (fun st ~ok ~err -> (Lazy.force f_self).run st ~ok ~err) } - and f_self = lazy (f self) in - self - -let try_ p = p - -let try_opt p : _ t = - { - run = - (fun st ~ok ~err:_ -> - p.run st ~ok:(fun st x -> ok st (Some x)) ~err:(fun _ -> ok st None)); - } - -let optional p : _ t = - { - run = - (fun st ~ok ~err:_ -> - p.run st ~ok:(fun st _x -> ok st ()) ~err:(fun _ -> ok st ())); - } - -let many_until ~until p : _ t = - fix (fun self -> - try_or until - ~f:(fun _ -> pure []) - ~else_: - ( p >>= fun x -> - self >|= fun l -> x :: l )) - -let many p : _ t = - fix (fun self -> - try_or p ~f:(fun x -> self >|= fun tl -> x :: tl) ~else_:(pure [])) - -let many1 p = - p >>= fun x -> - many p >|= fun l -> x :: l - -(* skip can be made efficient by not allocating intermediate parsers *) -let skip p : _ t = - let rec self = - { - run = - (fun st ~ok ~err -> - p.run st - ~ok:(fun st _ -> self.run st ~ok ~err) - ~err:(fun _ -> ok st ())); - } - in - self - -let sep_until ~until ~by p = - let rec read_p = - lazy - ( p >>= fun x -> - until *> pure [ x ] <|> by *> (Lazy.force read_p >|= fun tl -> x :: tl) - ) - in - until *> pure [] <|> Lazy.force read_p - -let sep ~by p = - let rec read_p = - lazy - (try_or p - ~f:(fun x -> - eoi *> pure [ x ] - <|> try_or by - ~f:(fun _ -> Lazy.force read_p >|= fun tl -> x :: tl) - ~else_:(pure [ x ])) - ~else_:(pure [])) - in - Lazy.force read_p - -let sep1 ~by p = - p >>= fun x -> - sep ~by p >|= fun tl -> x :: tl - -let lookahead p : _ t = - { - run = - (fun st ~ok ~err -> - p.run st ~ok:(fun _st x -> ok st x) (* discard p's new state *) ~err); - } - -let lookahead_ignore p : _ t = - { run = (fun st ~ok ~err -> p.run st ~ok:(fun _st _x -> ok st ()) ~err) } - -let set_current_slice sl : _ t = - { - run = - (fun _st ~ok ~err:_ -> - assert (CCShims_.Stdlib.(_st.cs == sl.cs)); - ok sl ()) - (* jump to slice *); - } - -let split_1 ~on_char : _ t = - { - run = - (fun st ~ok ~err:_ -> - if st.i >= st.j then - ok st (st, None) - else ( - match String.index_from st.cs.str st.i on_char with - | j -> - let x = { st with j } in - let y = { st with i = min st.j (j + 1) } in - let st_done = { st with i = st.j } in - (* empty *) - ok st_done (x, Some y) - | exception Not_found -> - let st_done = { st with i = st.j } in - (* empty *) - ok st_done (st, None) - )); - } - -let split_list_at_most ~on_char n : slice list t = - let rec loop acc n = - if n <= 0 then - (* add the rest to [acc] *) - all >|= fun rest -> - let acc = rest :: acc in - List.rev acc - else - try_or eoi ~f:(fun _ -> return (List.rev acc)) ~else_:(parse_1 acc n) - and parse_1 acc n = - split_1 ~on_char >>= fun (sl1, rest) -> - let acc = sl1 :: acc in - match rest with - | None -> return (List.rev acc) - | Some rest -> recurse rest (loop acc (n - 1)) - in - loop [] n - -let split_list ~on_char : _ t = split_list_at_most ~on_char max_int - -let split_2 ~on_char : _ t = - split_list_at_most ~on_char 3 >>= function - | [ a; b ] -> return (a, b) - | _ -> fail "split_2: expected 2 fields exactly" - -let split_3 ~on_char : _ t = - split_list_at_most ~on_char 4 >>= function - | [ a; b; c ] -> return (a, b, c) - | _ -> fail "split_3: expected 3 fields exactly" - -let split_4 ~on_char : _ t = - split_list_at_most ~on_char 5 >>= function - | [ a; b; c; d ] -> return (a, b, c, d) - | _ -> fail "split_4: expected 4 fields exactly" - -let split_list ~on_char : slice list t = - let rec loop acc = - try_or eoi ~f:(fun _ -> return (List.rev acc)) ~else_:(parse_1 acc) - and parse_1 acc = - split_1 ~on_char >>= fun (sl1, rest) -> - let acc = sl1 :: acc in - match rest with - | None -> return (List.rev acc) - | Some rest -> recurse rest (loop acc) - in - loop [] - -let each_split ~on_char p : 'a list t = - let rec loop acc = - split_1 ~on_char >>= fun (sl1, rest) -> - (* parse [sl1] with [p] *) - recurse sl1 p >>= fun x -> - let acc = x :: acc in - match rest with - | None -> return (List.rev acc) - | Some rest -> recurse rest (loop acc) - in - loop [] - -let line : slice t = - split_1 ~on_char:'\n' >>= fun (sl, rest) -> - match rest with - | None -> return sl - | Some rest -> set_current_slice rest >|= fun () -> sl - -let line_str = line >|= Slice.to_string -let each_line p : _ t = each_split ~on_char:'\n' p - -let memo (type a) (p : a t) : a t = - let id = !Memo_state.id_ in - incr Memo_state.id_; - let r = ref None in - - (* used for universal encoding *) - { - run = - (fun st ~ok ~err -> - let tbl = - match st.cs.memo with - | Some t -> t - | None -> - let tbl = Memo_tbl.create 32 in - st.cs.memo <- Some tbl; - tbl - in - - match - r := None; - let f = Memo_tbl.find tbl (st.i, id) in - f (); - !r - with - | None -> assert false - | Some (Ok (st, x)) -> ok st x - | Some (Error e) -> err e - | exception Not_found -> - (* parse, and save *) - p.run st - ~ok:(fun st' x -> - Memo_tbl.replace tbl (st.i, id) (fun () -> - r := Some (Ok (st', x))); - ok st' x) - ~err:(fun e -> - Memo_tbl.replace tbl (st.i, id) (fun () -> r := Some (Error e)); - err e)); - } - -let fix_memo f = - let rec p = { run = (fun st ~ok ~err -> (Lazy.force p').run st ~ok ~err) } - and p' = lazy (memo (f p)) in - p - -exception ParseError of Error.t - -let stringify_result = function - | Ok _ as x -> x - | Error e -> Error (Error.to_string e) - -let parse_string_exn p s = - p.run (state_of_string s) - ~ok:(fun _st x -> x) - ~err:(fun e -> raise (ParseError e)) - -let parse_string_e p s = - p.run (state_of_string s) ~ok:(fun _st x -> Ok x) ~err:(fun e -> Error e) - -let parse_string p s = parse_string_e p s |> stringify_result - -let read_all_ ic = - let buf = Buffer.create 1024 in - (try - while true do - let line = input_line ic in - Buffer.add_string buf line; - Buffer.add_char buf '\n' - done; - assert false - with End_of_file -> ()); - Buffer.contents buf - -let parse_file_e p file = - let ic = open_in file in - let s = read_all_ ic in - let r = parse_string_e p s in - close_in ic; - r - -let parse_file p file = parse_file_e p file |> stringify_result - -let parse_file_exn p file = - match parse_file_e p file with - | Ok x -> x - | Error e -> raise (ParseError e) - -module U = struct - let sep_ = sep - - let list ?(start = "[") ?(stop = "]") ?(sep = ";") p = - string start *> skip_white - *> sep_until - ~until:(skip_white <* string stop) - ~by:(skip_white *> string sep *> skip_white) - p - - let int = - skip_white - *> chars1_if ~descr:"integer" (fun c -> is_num c || char_equal c '-') - >>= fun s -> - try return (int_of_string s) with Failure _ -> fail "expected an int" - - let in_paren (p : 'a t) : 'a t = - skip_white *> (char '(' *> skip_white *> p <* skip_white <* char ')') - - let in_parens_opt (p : 'a t) : 'a t = - fix (fun self -> - skip_white - *> try_or (char '(') - ~f:(fun _ -> skip_white *> self <* skip_white <* char ')') - ~else_:p) - - let option p = - skip_white - *> try_or (string "Some") - ~f:(fun _ -> skip_white *> p >|= fun x -> Some x) - ~else_:(string "None" *> return None) - - let hexa_int = - (exact "0x" <|> return "") - *> ( chars1_if (function - | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true - | _ -> false) - >|= fun s -> - let i = ref 0 in - String.iter - (fun c -> - let n = - match c with - | '0' .. '9' -> Char.code c - Char.code '0' - | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 - | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 - | _ -> assert false - in - i := (!i * 16) + n) - s; - !i ) - - let prepend_str c s = String.make 1 c ^ s - let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) - - let bool = - skip_white - *> (string "true" *> return true <|> string "false" *> return false) - - let pair ?(start = "(") ?(stop = ")") ?(sep = ",") p1 p2 = - skip_white *> string start *> skip_white *> p1 >>= fun x1 -> - skip_white *> string sep *> skip_white *> p2 >>= fun x2 -> - skip_white *> string stop *> return (x1, x2) - - let triple ?(start = "(") ?(stop = ")") ?(sep = ",") p1 p2 p3 = - string start *> skip_white *> p1 >>= fun x1 -> - skip_white *> string sep *> skip_white *> p2 >>= fun x2 -> - skip_white *> string sep *> skip_white *> p3 >>= fun x3 -> - string stop *> return (x1, x2, x3) -end - -module Debug_ = struct - let trace_fail name p = - { - run = - (fun st ~ok ~err -> - p.run st ~ok ~err:(fun e -> - Printf.eprintf "trace %s: fail with %s\n%!" name - (Error.to_string e); - err e)); - } - - let trace_ ~both name ~print p = - { - run = - (fun st ~ok ~err -> - p.run st - ~ok:(fun st x -> - Printf.eprintf "trace %s: parsed %s\n%!" name (print x); - ok st x) - ~err:(fun e -> - if both then - Printf.eprintf "trace %s: fail with %s\n%!" name - (Error.to_string e); - err e)); - } - - let trace_success name ~print p = trace_ ~both:false name ~print p - let trace_success_or_fail name ~print p = trace_ ~both:true name ~print p -end diff --git a/src/parser/parser_comb.mli b/src/parser/parser_comb.mli deleted file mode 100644 index 11b9fa29..00000000 --- a/src/parser/parser_comb.mli +++ /dev/null @@ -1,695 +0,0 @@ -(** Simple Parser Combinators - -*) - -type position -(** A position in the input. Typically it'll point at the {b beginning} of - an error location. *) - -(** {2 Positions in input} - - *) -module Position : sig - type t = position - - val line : t -> int - (** Line number, 0 based *) - - val column : t -> int - (** Column number, 0 based *) - - val line_and_column : t -> int * int - (** Line and column number *) - - val min : t -> t -> t - val max : t -> t -> t - - val pp : Format.formatter -> t -> unit - (** Unspecified pretty-printed version of the position. *) -end - -(** {2 Errors} - *) -module Error : sig - type t - (** A parse error. - *) - - val position : t -> position - (** Returns position of the error *) - - val line_and_column : t -> int * int - (** Line and column numbers of the error position. *) - - val msg : t -> string - - val to_string : t -> string - (** Prints the error *) - - val pp : Format.formatter -> t -> unit - (** Pretty prints the error *) -end - -type +'a or_error = ('a, Error.t) result -(** ['a or_error] is either [Ok x] for some result [x : 'a], - or an error {!Error.t}. - - See {!stringify_result} and {!Error.to_string} to print the - error message. *) - -exception ParseError of Error.t - -(** {2 Input} *) - -(** {2 Combinators} *) - -type 'a t -(** The abstract type of parsers that return a value of type ['a] (or fail). - - @raise ParseError in case of failure. - the type is private. -*) - -val return : 'a -> 'a t -(** Always succeeds, without consuming its input. *) - -val pure : 'a -> 'a t -(** Synonym to {!return}. *) - -val map : ('a -> 'b) -> 'a t -> 'b t -val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t -val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t - -val bind : ('a -> 'b t) -> 'a t -> 'b t -(** [bind f p] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. -*) - -val ap : ('a -> 'b) t -> 'a t -> 'b t -(** Applicative. - *) - -val eoi : unit t -(** Expect the end of input, fails otherwise. *) - -val empty : unit t -(** Succeed with [()]. - *) - -val fail : string -> 'a t -(** [fail msg] fails with the given message. It can trigger a backtrack. *) - -val failf : ('a, unit, string, 'b t) format4 -> 'a -(** [Format.sprintf] version of {!fail}. *) - -val fail_lazy : (unit -> string) -> 'a t -(** Like {!fail}, but only produce an error message on demand. - *) - -val parsing : string -> 'a t -> 'a t -(** [parsing s p] behaves the same as [p], with the information that - we are parsing [s], if [p] fails. - The message [s] is added to the error, it does not replace it, - not does the location change (the error still points to - the same location as in [p]). *) - -val set_error_message : string -> 'a t -> 'a t -(** [set_error_message msg p] behaves like [p], but if [p] fails, - [set_error_message msg p] fails with [msg] instead and at the current - position. The internal error message of [p] is just discarded. - *) - -val pos : position t -(** [pos] returns the current position in the buffer. - - *) - -val with_pos : 'a t -> ('a * position) t -(** [with_pos p] behaves like [p], but returns the (starting) position - along with [p]'s result. - - *) - -val any_char : char t -(** [any_char] parses any character. - It still fails if the end of input was reached. - *) - -val any_char_n : int -> string t -(** [any_char_n len] parses exactly [len] characters from the input. - Fails if the input doesn't contain at least [len] chars. - *) - -val char : char -> char t -(** [char c] parses the character [c] and nothing else. *) - -type slice -(** A slice of the input, as returned by some combinators such - as {!split_1} or {!split_list} or {!take}. - - The idea is that one can use some parsers to cut the input into slices, - e.g. split into lines, or split a line into fields (think CSV or TSV). - Then a variety of parsers can be used on each slice to extract data from - it using {!recurse}. - - Slices contain enough information to make it possible - for [recurse slice p] to report failures (if [p] fails) using locations - from the original input, not relative to the slice. - Therefore, even after splitting the input into lines using, say, {!each_line}, - a failure to parse the 500th line will be reported at line 500 and - not at line 1. - - *) - -(** Functions on slices. - *) -module Slice : sig - type t = slice - - val is_empty : t -> bool - (** Is the slice empty? *) - - val length : t -> int - (** Length of the slice *) - - val to_string : t -> string - (** Convert the slice into a string. - Linear time and memory in [length slice] *) -end - -val recurse : slice -> 'a t -> 'a t -(** [recurse slice p] parses the [slice] - (most likely obtained via another combinator, such as {!split_1} - or {!split_n}), using [p]. - - The slice contains a position which is used to relocate error - messages to their position in the whole input, not just relative to - the slice. - - *) - -val set_current_slice : slice -> unit t -(** [set_current_slice slice] replaces the parser's state with [slice]. - - *) - -val chars_fold : - f: - ('acc -> - char -> - [ `Continue of 'acc - | `Consume_and_stop of 'acc - | `Stop of 'acc - | `Fail of string ]) -> - 'acc -> - ('acc * slice) t -(** [chars_fold f acc0] folds over characters of the input. - Each char [c] is passed, along with the current accumulator, to [f]; - [f] can either: - - - stop, by returning [`Stop acc]. In this case the final accumulator [acc] - is returned, and [c] is not consumed. - - consume char and stop, by returning [`Consume_and_stop acc]. - - fail, by returning [`Fail msg]. In this case the parser fails - with the given message. - - continue, by returning [`Continue acc]. The parser continues to the - next char with the new accumulator. - - This is a generalization of of {!chars_if} that allows one to transform - characters on the fly, skip some, handle escape sequences, etc. - It can also be useful as a base component for a lexer. - - @return a pair of the final accumular, and the slice matched by the fold. - *) - -val chars_fold_transduce : - f: - ('acc -> - char -> - [ `Continue of 'acc - | `Yield of 'acc * char - | `Consume_and_stop - | `Stop - | `Fail of string ]) -> - 'acc -> - ('acc * string) t -(** Same as {!char_fold} but with the following differences: - - - returns a string along with the accumulator, rather than the slice - of all the characters accepted by [`Continue _]. - The string is built from characters returned by [`Yield]. - - new case [`Yield (acc, c)] adds [c] to the returned string - and continues parsing with [acc]. - - *) - -val guard : ?descr:string -> ('a -> bool) -> 'a t -> 'a t -(** [guard f p] is like [p], but fails if the value returned by [p] - does not satisfy [f]. - @param descr used to provide a better error message *) - -val take : int -> slice t -(** [take len] parses exactly [len] characters from the input. - Fails if the input doesn't contain at least [len] chars. - *) - -val take_if : (char -> bool) -> slice t -(** [take_if f] takes characters as long as they satisfy the predicate [f]. - *) - -val take1_if : ?descr:string -> (char -> bool) -> slice t -(** [take1_if f] takes characters as long as they satisfy the predicate [f]. - Fails if no character satisfies [f]. - @param descr describes what kind of character was expected, in case of error - *) - -val char_if : ?descr:string -> (char -> bool) -> char t -(** [char_if f] parses a character [c] if [f c = true]. - Fails if the next char does not satisfy [f]. - @param descr describes what kind of character was expected, in case of error *) - -val chars_if : (char -> bool) -> string t -(** [chars_if f] parses a string of chars that satisfy [f]. - Cannot fail. *) - -val chars1_if : ?descr:string -> (char -> bool) -> string t -(** Like {!chars_if}, but accepts only non-empty strings. - [chars1_if p] fails if the string accepted by [chars_if p] is empty. - [chars1_if p] is equivalent to [take1_if p >|= Slice.to_string]. - @param descr describes what kind of character was expected, in case of error *) - -val endline : char t -(** Parse '\n'. *) - -val space : char t -(** Tab or space. *) - -val white : char t -(** Tab or space or newline. *) - -val skip_chars : (char -> bool) -> unit t -(** Skip 0 or more chars satisfying the predicate. *) - -val skip_space : unit t -(** Skip ' ' and '\t'. *) - -val skip_white : unit t -(** Skip ' ' and '\t' and '\n'. *) - -val is_alpha : char -> bool -(** Is the char a letter? *) - -val is_num : char -> bool -(** Is the char a digit? *) - -val is_alpha_num : char -> bool -(** Is the char a letter or a digit? *) - -val is_space : char -> bool -(** True on ' ' and '\t'. *) - -val is_white : char -> bool -(** True on ' ' and '\t' and '\n'. *) - -val suspend : (unit -> 'a t) -> 'a t -(** [suspend f] is the same as [f ()], but evaluates [f ()] only - when needed. - - A practical use case is to implement recursive parsers manually, - as described in {!fix}. The parser is [let rec p () = …], - and [suspend p] can be used in the definition to use [p]. -*) - -val string : string -> string t -(** [string s] parses exactly the string [s], and nothing else. *) - -val exact : string -> string t -(** Alias to {!string}. - *) - -val many : 'a t -> 'a list t -(** [many p] parses [p] repeatedly, until [p] fails, and - collects the results into a list. *) - -val optional : _ t -> unit t -(** [optional p] tries to parse [p], and return [()] whether it - succeeded or failed. Cannot fail itself. - It consumes input if [p] succeeded (as much as [p] consumed), but - consumes not input if [p] failed. - *) - -val try_ : 'a t -> 'a t - [@@deprecated "plays no role anymore, just replace [try foo] with [foo]"] -(** [try_ p] is just like [p] (it used to play a role in backtracking - semantics but no more). - - @deprecated since 3.6 it can just be removed. See {!try_opt} if you want - to detect failure. *) - -val try_opt : 'a t -> 'a option t -(** [try_opt p] tries to parse using [p], and return [Some x] if [p] - succeeded with [x] (and consumes what [p] consumed). - Otherwise it returns [None] and consumes nothing. This cannot fail. - *) - -val many_until : until:_ t -> 'a t -> 'a list t -(** [many_until ~until p] parses as many [p] as it can until - the [until] parser successfully returns. - If [p] fails before that then [many_until ~until p] fails as well. - Typically [until] can be a closing ')' or another termination condition, - and what is consumed by [until] is also consumed by [many_until ~until p]. - - *) - -val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t -(** [try_or p1 ~f ~else_:p2] attempts to parse [x] using [p1], - and then becomes [f x]. - If [p1] fails, then it becomes [p2]. This can be useful if [f] is expensive - but only ever works if [p1] matches (e.g. after an opening parenthesis - or some sort of prefix). - -*) - -val try_or_l : ?msg:string -> ?else_:'a t -> (unit t * 'a t) list -> 'a t -(** [try_or_l ?else_ l] tries each pair [(test, p)] in order. - If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p], - whether [p] fails or not. If [test] consumes input, the state is restored - before calling [p]. - If they all fail, and [else_] is defined, then it behaves like [else_]. - If all fail, and [else_] is [None], then it fails as well. - - This is a performance optimization compared to {!(<|>)}. We commit to a - branch if the test succeeds, without backtracking at all. - It can also provide better error messages, because failures in the parser - will not be reported as failures in [try_or_l]. - - See {!lookahead_ignore} for a convenient way of writing the test conditions. - - @param msg error message if all options fail - - *) - -val or_ : 'a t -> 'a t -> 'a t -(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] - from the same position. - *) - -val both : 'a t -> 'b t -> ('a * 'b) t -(** [both a b] parses [a], then [b], then returns the pair of their results. - *) - -val many1 : 'a t -> 'a list t -(** [many1 p] is like [many p] excepts it fails if the - list is empty (i.e. it needs [p] to succeed at least once). *) - -val skip : _ t -> unit t -(** [skip p] parses zero or more times [p] and ignores its result. - It is eager, meaning it will continue as long as [p] succeeds. - As soon as [p] fails, [skip p] stops consuming any input. *) - -val sep : by:_ t -> 'a t -> 'a list t -(** [sep ~by p] parses a list of [p] separated by [by]. *) - -val sep_until : until:_ t -> by:_ t -> 'a t -> 'a list t -(** Same as {!sep} but stop when [until] parses successfully. - *) - -val sep1 : by:_ t -> 'a t -> 'a list t -(** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) - -val lookahead : 'a t -> 'a t -(** [lookahead p] behaves like [p], except it doesn't consume any input. - *) - -val lookahead_ignore : 'a t -> unit t -(** [lookahead_ignore p] tries to parse input with [p], - and succeeds if [p] succeeds. However it doesn't consume any input - and returns [()], so in effect its only use-case is to detect - whether [p] succeeds, e.g. in {!try_or_l}. - - *) - -val fix : ('a t -> 'a t) -> 'a t -(** Fixpoint combinator. [fix (fun self -> p)] is the parser [p], - in which [self] refers to the parser [p] itself (which is useful to - parse recursive structures. - - An alternative, manual implementation to [let p = fix (fun self -> q)] - is: - {[ let rec p () = - let self = suspend p in - q - ]} -*) - -val line : slice t -(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n']. - *) - -val line_str : string t -(** [line_str] is [line >|= Slice.to_string]. - It parses the next line and turns the slice into a string. - The state points to the character immediately after the ['\n'] character. - *) - -val each_line : 'a t -> 'a list t -(** [each_line p] runs [p] on each line of the input. - *) - -val split_1 : on_char:char -> (slice * slice option) t -(** [split_1 ~on_char] looks for [on_char] in the input, and returns a - pair [sl1, sl2], where: - - - [sl1] is the slice of the input the precedes the first occurrence - of [on_char], or the whole input if [on_char] cannot be found. - It does not contain [on_char]. - - [sl2] is the slice that comes after [on_char], - or [None] if [on_char] couldn't be found. It doesn't contain the first - occurrence of [on_char] (if any). - - The parser is now positioned at the end of the input. - *) - -val split_list : on_char:char -> slice list t -(** [split_list ~on_char] splits the input on all occurrences of [on_char], - returning a list of slices. - *) - -val split_list_at_most : on_char:char -> int -> slice list t -(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most - [n] times, to get a list of [n+1] elements. - The last element might contain [on_char]. This is useful to limit the - amount of work done by {!split_list}. - *) - -val split_2 : on_char:char -> (slice * slice) t -(** [split_2 ~on_char] splits the input into exactly 2 fields, - and fails if the split yields less or more than 2 items. - *) - -val split_3 : on_char:char -> (slice * slice * slice) t -(** See {!split_2} - *) - -val split_4 : on_char:char -> (slice * slice * slice * slice) t -(** See {!split_2} - *) - -val each_split : on_char:char -> 'a t -> 'a list t -(** [split_list_map ~on_char p] uses [split_list ~on_char] to split - the input, then parses each chunk of the input thus obtained using [p]. - - The difference with [sep ~by:(char on_char) p] is that - [sep] calls [p] first, and only tries to find [on_char] after [p] returns. - While it is more flexible, this technique also means [p] has to be careful - not to consume [on_char] by error. - - A useful specialization of this is {!each_line}, which is - basically [each_split ~on_char:'\n' p]. - *) - -val all : slice t -(** [all] returns all the unconsumed input as a slice, and consumes it. - Use {!Slice.to_string} to turn it into a string. - - Note that [lookahead all] can be used to {i peek} at the rest of the input - without consuming anything. - - *) - -val all_str : string t -(** [all_str] accepts all the remaining chars and extracts them into a - string. Similar to {!all} but with a string. - *) - -val memo : 'a t -> 'a t -(** Memoize the parser. [memo p] will behave like [p], but when called - in a state (read: position in input) it has already processed, [memo p] - returns a result directly. The implementation uses an underlying - hashtable. - This can be costly in memory, but improve the run time a lot if there - is a lot of backtracking involving [p]. - - Do not call {!memo} inside other functions, especially with {!(>>=)}, - {!map}, etc. being so prevalent. Instead the correct way to use it - is in a toplevel definition: - - {[ - let my_expensive_parser = memo (foo *> bar >>= fun i -> …) - ]} - - This function is not thread-safe. *) - -val fix_memo : ('a t -> 'a t) -> 'a t -(** Like {!fix}, but the fixpoint is memoized. *) - -(** {2 Infix} *) - -module Infix : sig - val ( >|= ) : 'a t -> ('a -> 'b) -> 'b t - (** Alias to {!map}. [p >|= f] parses an item [x] using [p], - and returns [f x]. *) - - val ( >>= ) : 'a t -> ('a -> 'b t) -> 'b t - (** Alias to {!bind}. - [p >>= f] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. *) - - val ( <*> ) : ('a -> 'b) t -> 'a t -> 'b t - (** Applicative. *) - - val ( <* ) : 'a t -> _ t -> 'a t - (** [a <* b] parses [a] into [x], parses [b] and ignores its result, - and returns [x]. *) - - val ( *> ) : _ t -> 'a t -> 'a t - (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - result of [a] is ignored. *) - - val ( <|> ) : 'a t -> 'a t -> 'a t - (** Alias to {!or_}. - - [a <|> b] tries to parse [a], and if [a] fails without - consuming any input, backtracks and tries - to parse [b], otherwise it fails as [a]. *) - - val ( ) : 'a t -> string -> 'a t - (** [a msg] behaves like [a], but if [a] fails, - [a msg] fails with [msg] instead. - Useful as the last choice in a series of [<|>]. For example: - [a <|> b <|> c "expected one of a, b, c"]. *) - - val ( ||| ) : 'a t -> 'b t -> ('a * 'b) t - (** Alias to {!both}. - [a ||| b] parses [a], then [b], then returns the pair of their results. - *) - - val ( let+ ) : 'a t -> ('a -> 'b) -> 'b t - val ( let* ) : 'a t -> ('a -> 'b t) -> 'b t - val ( and+ ) : 'a t -> 'b t -> ('a * 'b) t - val ( and* ) : 'a t -> 'b t -> ('a * 'b) t -end - -include module type of Infix - -(** {2 Parse input} *) - -val stringify_result : 'a or_error -> ('a, string) result -(** Turn a {!Error.t}-oriented result into a more basic string result. - *) - -val parse_string : 'a t -> string -> ('a, string) result -(** Parse a string using the parser. *) - -val parse_string_e : 'a t -> string -> 'a or_error -(** Version of {!parse_string} that returns a more detailed error. *) - -val parse_string_exn : 'a t -> string -> 'a -(** @raise ParseError if it fails. *) - -val parse_file : 'a t -> string -> ('a, string) result -(** [parse_file p filename] parses file named [filename] with [p] - by opening the file and reading it whole. *) - -val parse_file_e : 'a t -> string -> 'a or_error -(** Version of {!parse_file} that returns a more detailed error. *) - -val parse_file_exn : 'a t -> string -> 'a -(** Same as {!parse_file}, but - @raise ParseError if it fails. *) - -(** {2 Utils} - - This is useful to parse OCaml-like values in a simple way. - All the parsers are whitespace-insensitive (they skip whitespace). *) -module U : sig - val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t - (** [list p] parses a list of [p], with the OCaml conventions for - start token "\[", stop token "\]" and separator ";". - Whitespace between items are skipped. *) - - (* TODO: parse option? *) - (* TODO: split on whitespace? *) - - val int : int t - (** Parse an int in decimal representation. *) - - val in_paren : 'a t -> 'a t - (** [in_paren p] parses an opening "(",[p] , and then ")". - *) - - val in_parens_opt : 'a t -> 'a t - (** [in_parens_opt p] parses [p] in an arbitrary number of nested - parenthesis (possibly 0). - *) - - val option : 'a t -> 'a option t - (** [option p] parses "Some " into [Some x] if [p] parses "" into [x], - and parses "None" into [None]. - *) - - val hexa_int : int t - (** Parse an int int hexadecimal format. Accepts an optional [0x] prefix, - and ignores capitalization. - *) - - val word : string t - (** Non empty string of alpha num, start with alpha. *) - - val bool : bool t - (** Accepts "true" or "false" - *) - - (* TODO: quoted string *) - - val pair : - ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> ('a * 'b) t - (** Parse a pair using OCaml syntactic conventions. - The default is "(a, b)". *) - - val triple : - ?start:string -> - ?stop:string -> - ?sep:string -> - 'a t -> - 'b t -> - 'c t -> - ('a * 'b * 'c) t - (** Parse a triple using OCaml syntactic conventions. - The default is "(a, b, c)". *) -end - -(** Debugging utils. *) -module Debug_ : sig - val trace_fail : string -> 'a t -> 'a t - (** [trace_fail name p] behaves like [p], but prints the error message of [p] - on stderr whenever [p] fails. - @param name used as a prefix of all trace messages. *) - - val trace_success : string -> print:('a -> string) -> 'a t -> 'a t - (** [trace_success name ~print p] behaves like [p], but - prints successful runs of [p] using [print]. *) - - val trace_success_or_fail : string -> print:('a -> string) -> 'a t -> 'a t - (** Trace both error or success *) -end diff --git a/src/parser/position.ml b/src/parser/position.ml new file mode 100644 index 00000000..6b6d527e --- /dev/null +++ b/src/parser/position.ml @@ -0,0 +1,26 @@ +type t = { file: string; line: int; col: int } + +let line self = self.line +let col self = self.col +let ( <= ) x y = x.line < y.line || (x.line = y.line && x.col <= y.col) +let ( < ) x y = x.line < y.line || (x.line = y.line && x.col < y.col) +let ( >= ) a b = b <= a +let ( > ) a b = b < a + +let min a b = + if a <= b then + a + else + b + +let max a b = + if a >= b then + a + else + b + +let pp out self = Format.fprintf out "at line %d, column %d" self.line self.col + +(** Build position from a lexing position *) +let of_lexpos (p : Lexing.position) : t = + { file = p.pos_fname; line = p.pos_lnum; col = p.pos_cnum - p.pos_bol } diff --git a/src/parser/sidekick_parser.ml b/src/parser/sidekick_parser.ml index 150792e9..13fa71e7 100644 --- a/src/parser/sidekick_parser.ml +++ b/src/parser/sidekick_parser.ml @@ -1,3 +1,6 @@ module Ast_term = Ast_term -module Parser_comb = Parser_comb -module Parse_term = Parse_term +module Parse_util = Parse_util +module Parse = Parse +module Lex = Lex +module Loc = Loc +module Position = Position