From 1c2c9deefdee070396b259f4554f043fd02a31b5 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Wed, 5 Oct 2022 23:02:32 -0400 Subject: [PATCH] wip: feat: basic term parser --- src/parser/ast_term.ml | 111 ++++ src/parser/dune | 6 + src/parser/parse_term.ml | 108 ++++ src/parser/parse_term.mli | 9 + src/parser/parser_comb.ml | 1009 +++++++++++++++++++++++++++++++++ src/parser/parser_comb.mli | 695 +++++++++++++++++++++++ src/parser/sidekick_parser.ml | 3 + 7 files changed, 1941 insertions(+) create mode 100644 src/parser/ast_term.ml create mode 100644 src/parser/dune create mode 100644 src/parser/parse_term.ml create mode 100644 src/parser/parse_term.mli create mode 100644 src/parser/parser_comb.ml create mode 100644 src/parser/parser_comb.mli create mode 100644 src/parser/sidekick_parser.ml diff --git a/src/parser/ast_term.ml b/src/parser/ast_term.ml new file mode 100644 index 00000000..ce029f4a --- /dev/null +++ b/src/parser/ast_term.ml @@ -0,0 +1,111 @@ +module Pos = Parser_comb.Position + +type position = Pos.t +type loc = { start: position; end_: position } + +let mk_loc ~start ~end_ : loc = { start; end_ } + +let pp_loc out (loc : loc) = + Fmt.fprintf out "%a - %a" Pos.pp loc.start Pos.pp loc.end_ + +let loc_merge a b : loc = + { start = Pos.min a.start b.start; end_ = Pos.max a.end_ b.end_ } + +type term = { view: term_view; loc: loc } +(** Expressions *) + +and term_view = + | Var of string + | Int of string + | App of term * term list + | Let of (var * term) list * term + | Lambda of var list * term + | Pi of var list * term + | Arrow of term list * term + | Error_node of string + +and var = { name: string; ty: term option } +(** Variable *) + +open struct + let mk_ ~loc view : term = { loc; view } +end + +let view (t : term) = t.view +let loc (t : term) = t.loc +let var ?ty name : var = { name; ty } +let mk_var ~loc v : term = mk_ ~loc (Var v) + +let mk_app f args : term = + if args = [] then + f + else ( + let loc = List.fold_left (fun loc a -> loc_merge loc a.loc) f.loc args in + mk_ ~loc (App (f, args)) + ) + +let mk_arrow ~loc args ret : term = + if args = [] then + ret + else + mk_ ~loc (Arrow (args, ret)) + +let mk_lam ~loc args bod : term = + if args = [] then + bod + else + mk_ ~loc (Lambda (args, bod)) + +let mk_int ~loc x : term = mk_ ~loc (Int x) + +let mk_pi ~loc args bod : term = + if args = [] then + bod + else + mk_ ~loc (Pi (args, bod)) + +let mk_let ~loc bs bod : term = + if bs = [] then + bod + else ( + let loc = loc_merge loc bod.loc in + mk_ ~loc (Let (bs, bod)) + ) + +let mk_error ~loc msg : term = mk_ ~loc (Error_node msg) + +(** Pretty print terms *) +let rec pp_term out (e : term) : unit = + let pp = pp_term in + let pp_sub out e = + match e.view with + | App _ | Arrow _ | Pi _ | Let _ | Lambda _ -> + Fmt.fprintf out "(@[%a@])" pp e + | Var _ | Error_node _ | Int _ -> pp out e + in + + let pp_tyvar out x = + match x.ty with + | None -> Fmt.string out x.name + | Some ty -> Fmt.fprintf out "(@[%s : %a@])" x.name pp ty + in + + match e.view with + | Var v -> Fmt.string out v + | Error_node msg -> Fmt.fprintf out "" msg + | Int i -> Fmt.string out i + | App (f, args) -> + Fmt.fprintf out "@[%a@ %a@]" pp_sub f (Util.pp_list pp_sub) args + | Arrow (args, ret) -> + Fmt.fprintf out "@[%a -> %a@]" + (Util.pp_list ~sep:" -> " pp_sub) + args pp_sub ret + | Let (bs, bod) -> + let ppb out (x, t) = Fmt.fprintf out "@[<2>%s :=@ %a@]" x.name pp t in + Fmt.fprintf out "@[@[<2>let@ %a@] in@ %a@]" + (Util.pp_list ~sep:"and" ppb) + bs pp_sub bod + | Lambda (args, bod) -> + Fmt.fprintf out "@[lam %a.@ %a@]" (Util.pp_list pp_tyvar) args pp_sub bod + | Pi (args, bod) -> + Fmt.fprintf out "@[pi %a.@ %a@]" (Util.pp_list pp_tyvar) args pp_sub bod diff --git a/src/parser/dune b/src/parser/dune new file mode 100644 index 00000000..94a53ab7 --- /dev/null +++ b/src/parser/dune @@ -0,0 +1,6 @@ +(library + (name sidekick_parser) + (public_name sidekick.parser) + (synopsis "A term parser") + (libraries sidekick.core sidekick.util) + (flags :standard -open Sidekick_util)) diff --git a/src/parser/parse_term.ml b/src/parser/parse_term.ml new file mode 100644 index 00000000..3bce1b90 --- /dev/null +++ b/src/parser/parse_term.ml @@ -0,0 +1,108 @@ +module A = Ast_term +module P = Parser_comb +open P.Infix + +let is_alpha = function + | 'a' .. 'z' | 'A' .. 'Z' -> true + | _ -> false + +let is_num = function + | '0' .. '9' -> true + | _ -> false + +let is_alphanum = function + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true + | _ -> false + +let id : string P.t = + P.chars_fold_transduce `Start ~f:(fun st c -> + match st, c with + | `Start, c when is_alpha c -> `Yield (`Inside, c) + | `Inside, c when is_alphanum c -> `Yield (`Inside, c) + | `Start, _ -> `Fail "expected identifier" + | `Inside, _ -> `Stop) + >|= snd + +let int : string P.t = + P.chars_fold_transduce `Start ~f:(fun st c -> + match st, c with + | `Start, '-' -> `Yield (`Post_neg, c) + | (`Start | `Post_neg | `Inside), c when is_num c -> `Yield (`Inside, c) + | `Start, _ -> `Fail "expected integer" + | `Post_neg, _ -> `Fail "expected a number after '-'" + | `Inside, _ -> `Stop) + >|= snd + +let with_loc (p : 'a P.t) : ('a * A.loc) P.t = + let+ start = P.pos and+ x = p and+ end_ = P.pos in + let loc = A.mk_loc ~start ~end_ in + x, loc + +(* TODO: also skip comments *) +let skip_white : unit P.t = P.skip_white + +let p_var : A.term P.t = + let+ name, loc = with_loc id in + A.mk_var ~loc name + +let p_int : A.term P.t = + let+ x, loc = with_loc int in + A.mk_int ~loc x + +(* main parser *) +let rec p_term () : A.term P.t = + P.suspend @@ fun () -> + P.skip_white + *> (P.try_or_l ~msg:"expected term" + @@ List.flatten + [ + [ + ( P.lookahead_ignore (P.guard (String.equal "let") id), + let+ _id_let, loc = with_loc id + and+ x = skip_white *> id + and+ _ = skip_white *> P.string ":=" + and+ t = p_term () + and+ _ = skip_white *> P.string "in" + and+ bod = p_term () in + assert (_id_let = "let"); + (* TODO: allow [let x : _ := t in bod] *) + let x = A.var x in + A.mk_let ~loc [ x, t ] bod ); + ]; + p_term_atomic_cases () ~f:(fun t -> p_term_apply t []); + ]) + +and p_term_atomic_cases ~f () : _ list = + [ + P.lookahead_ignore id, p_var >>= f; + P.lookahead_ignore int, p_int >>= f; + ( P.lookahead_ignore (P.char '('), + P.char '(' *> p_term () <* skip_white *> P.char ')' >>= f ); + ] + +and p_term_atomic ?else_ ~f () = + P.suspend @@ fun () -> + P.try_or_l ?else_ ~msg:"expected atomic term" @@ p_term_atomic_cases ~f () + +(* parse application of [t] to 0 or more arguments *) +and p_term_apply t args : A.term P.t = + P.suspend @@ fun () -> + let ret = P.suspend @@ fun () -> P.return @@ A.mk_app t (List.rev args) in + skip_white + *> (P.try_or_l ~else_:ret + @@ List.flatten + [ + [ + P.eoi, ret; + ( P.lookahead_ignore (P.guard (fun s -> s = "let" || s = "in") id), + (* if we meet some keyword, we stop *) + ret ); + ]; + p_term_atomic_cases () ~f:(fun a -> p_term_apply t (a :: args)); + ]) + +let p = p_term () +let of_string s = P.parse_string_e p s +let of_string_exn s = P.parse_string_exn p s +let of_string_l s = P.parse_string_e (P.many p) s +let of_string_l_exn s = P.parse_string_exn (P.many p) s diff --git a/src/parser/parse_term.mli b/src/parser/parse_term.mli new file mode 100644 index 00000000..569ceb59 --- /dev/null +++ b/src/parser/parse_term.mli @@ -0,0 +1,9 @@ +module A = Ast_term + +val p : A.term Parser_comb.t +(** Term parser *) + +val of_string : string -> A.term Parser_comb.or_error +val of_string_exn : string -> A.term +val of_string_l : string -> A.term list Parser_comb.or_error +val of_string_l_exn : string -> A.term list diff --git a/src/parser/parser_comb.ml b/src/parser/parser_comb.ml new file mode 100644 index 00000000..515d6b12 --- /dev/null +++ b/src/parser/parser_comb.ml @@ -0,0 +1,1009 @@ +module Memo_tbl = Hashtbl.Make (struct + type t = int * int (* id of parser, position *) + + let equal ((a, b) : t) (c, d) = a = c && b = d + let hash = Hashtbl.hash +end) + +module Memo_state = struct + (* table of closures, used to implement universal type *) + type t = (unit -> unit) Memo_tbl.t + + (* unique ID for each parser *) + let id_ = ref 0 +end + +(* state common to all parser instances *) +type common_state = { + str: string; + mutable line_offsets: int array option; + mutable memo: Memo_state.t option; +} + +type position = { + pos_cs: common_state; + pos_offset: int; + mutable pos_lc: (int * int) option; +} + +module Position = struct + type t = position + + let compute_line_offsets_ (s : string) : int array = + let lines = Vec.create () in + let i = ref 0 in + Vec.push lines 0; + while !i < String.length s do + match String.index_from s !i '\n' with + | exception Not_found -> i := String.length s + | j -> + Vec.push lines j; + i := j + 1 + done; + Vec.to_array lines + + let line_offsets_ cs = + match cs.line_offsets with + | Some lines -> lines + | None -> + let lines = compute_line_offsets_ cs.str in + cs.line_offsets <- Some lines; + lines + + let int_cmp_ : int -> int -> int = compare + + (* TODO: use pos_cs.line_offsets *) + (* actually re-compute line and column from the buffer *) + let compute_line_and_col_ (cs : common_state) (off : int) : int * int = + let offsets = line_offsets_ cs in + assert (offsets.(0) = 0); + match CCArray.bsearch ~cmp:int_cmp_ off offsets with + | `At 0 -> 0, 0 + | `At n -> n - 1, off - offsets.(n - 1) - 1 + | `Just_after n -> n, off - offsets.(n) + | `Empty -> assert false + | `All_bigger -> assert false (* off >= 0, and offsets[0] == 0 *) + | `All_lower -> + let n = Array.length offsets - 1 in + n, off - offsets.(n) + + let line_and_column self = + match self.pos_lc with + | Some tup -> tup + | None -> + let tup = compute_line_and_col_ self.pos_cs self.pos_offset in + self.pos_lc <- Some tup; + (* save *) + tup + + let line self = fst (line_and_column self) + let column self = snd (line_and_column self) + + let min a b = + if a.pos_offset < b.pos_offset then + a + else + b + + let max a b = + if a.pos_offset > b.pos_offset then + a + else + b + + let pp out self = + let l, c = line_and_column self in + Format.fprintf out "at line %d, column %d" l c +end + +module Error = struct + type t = { msg: unit -> string; pos: position } + + let position self = self.pos + let line_and_column self = Position.line_and_column self.pos + let msg self = self.msg () + + let to_string self = + let line, col = line_and_column self in + Printf.sprintf "at line %d, char %d: %s" line col (self.msg ()) + + let pp out self = + let line, col = line_and_column self in + Format.fprintf out "@[at line %d, char %d:@ %s@]" line col (self.msg ()) +end + +type +'a or_error = ('a, Error.t) result + +type state = { + cs: common_state; + i: int; (* offset in [str] *) + j: int; (* end pointer in [str], excluded. [len = j-i] *) +} +(** Purely functional state passed around *) +(* FIXME: replace memo with: + [global : global_st ref] + + where: + [type global = { + mutable memo: Memo_state.t option; + line_offsets: int Vec.vector; + } + + with line_offsets used to cache the offset where each line begins, + and is computed lazily, to make {!Position.line_and_column} + faster if called many times. +*) + +let[@inline] char_equal (a : char) b = Stdlib.( = ) a b +let string_equal = String.equal +let[@inline] const_str_ x () : string = x + +let state_of_string str = + let s = + { + cs = { str; memo = None; line_offsets = None }; + i = 0; + j = String.length str; + } + in + s + +let[@inline] is_done st = st.i >= st.j +let[@inline] cur st = st.cs.str.[st.i] + +let pos_of_st_ st : position = + { pos_cs = st.cs; pos_offset = st.i; pos_lc = None } + +let mk_error_ st msg : Error.t = { Error.msg; pos = pos_of_st_ st } + +(* consume one char, passing it to [ok]. *) +let consume_ st ~ok ~err = + if is_done st then ( + let msg = const_str_ "unexpected end of input" in + err (mk_error_ st msg) + ) else ( + let c = st.cs.str.[st.i] in + ok { st with i = st.i + 1 } c + ) + +type 'a t = { + run: 'b. state -> ok:(state -> 'a -> 'b) -> err:(Error.t -> 'b) -> 'b; +} +[@@unboxed] +(** Takes the input and two continuations: + {ul + {- [ok] to call with the result and new state when it's done} + {- [err] to call when the parser met an error} + } +*) + +let return x : _ t = { run = (fun st ~ok ~err:_ -> ok st x) } +let pure = return + +let map f (p : 'a t) : _ t = + { run = (fun st ~ok ~err -> p.run st ~ok:(fun st x -> ok st (f x)) ~err) } + +let bind f (p : 'a t) : _ t = + { + run = + (fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + let p2 = f x in + p2.run st ~ok ~err) + ~err); + } + +let ap (f : _ t) (a : _ t) : _ t = + { + run = + (fun st ~ok ~err -> + f.run st + ~ok:(fun st f -> a.run st ~ok:(fun st x -> ok st (f x)) ~err) + ~err); + } + +let ap_left (a : _ t) (b : _ t) : _ t = + { + run = + (fun st ~ok ~err -> + a.run st ~ok:(fun st x -> b.run st ~ok:(fun st _ -> ok st x) ~err) ~err); + } + +let ap_right (a : _ t) (b : _ t) : _ t = + { + run = + (fun st ~ok ~err -> + a.run st ~ok:(fun st _ -> b.run st ~ok:(fun st x -> ok st x) ~err) ~err); + } + +let or_ (p1 : 'a t) (p2 : 'a t) : _ t = + { + run = (fun st ~ok ~err -> p1.run st ~ok ~err:(fun _e -> p2.run st ~ok ~err)); + } + +let both a b = + { + run = + (fun st ~ok ~err -> + a.run st + ~ok:(fun st xa -> b.run st ~ok:(fun st xb -> ok st (xa, xb)) ~err) + ~err); + } + +let set_error_message msg (p : 'a t) : _ t = + { + run = + (fun st ~ok ~err -> + p.run st ~ok ~err:(fun _e -> err (mk_error_ st (const_str_ msg)))); + } + +module Infix = struct + let[@inline] ( >|= ) p f = map f p + let[@inline] ( >>= ) p f = bind f p + let ( <*> ) = ap + let ( <* ) = ap_left + let ( *> ) = ap_right + let ( <|> ) = or_ + let ( ||| ) = both + let[@inline] ( ) p msg = set_error_message msg p + + [@@@ifge 4.8] + + let ( let+ ) = ( >|= ) + let ( let* ) = ( >>= ) + let ( and+ ) = both + let ( and* ) = ( and+ ) + + [@@@endif] +end + +include Infix + +let map2 f x y = pure f <*> x <*> y +let map3 f x y z = pure f <*> x <*> y <*> z + +let junk_ (st : state) : state = + assert (st.i < st.j); + { st with i = st.i + 1 } + +let eoi = + { + run = + (fun st ~ok ~err -> + if is_done st then + ok st () + else + err (mk_error_ st (const_str_ "expected end of input"))); + } + +let with_pos p : _ t = + { + run = + (fun st ~ok ~err -> + p.run st ~ok:(fun st' x -> ok st' (x, pos_of_st_ st)) ~err); + } + +let pos : _ t = { run = (fun st ~ok ~err:_ -> ok st (pos_of_st_ st)) } + +(* a slice is just a state, which makes {!recurse} quite easy. *) +type slice = state + +module Slice = struct + type t = slice + + let length sl = sl.j - sl.i + let is_empty sl = sl.i = sl.j + let to_string sl = String.sub sl.cs.str sl.i (length sl) +end + +let recurse slice p : _ t = + { + run = + (fun _st ~ok ~err -> + (* make sure these states are related. all slices share the + same reference as the initial state they derive from. *) + assert (CCShims_.Stdlib.(_st.cs == slice.cs)); + p.run slice ~ok ~err); + } + +let all = + { + run = + (fun st ~ok ~err:_ -> + if is_done st then + ok st st + else ( + let st_done = { st with i = st.j } in + ok st_done st + )); + } + +let all_str = all >|= Slice.to_string + +let fail msg : _ t = + { run = (fun st ~ok:_ ~err -> err (mk_error_ st (const_str_ msg))) } + +let failf msg = Printf.ksprintf fail msg +let fail_lazy msg = { run = (fun st ~ok:_ ~err -> err (mk_error_ st msg)) } + +let parsing what p = + { + run = + (fun st ~ok ~err -> + p.run st ~ok ~err:(fun e -> + let msg () = + Printf.sprintf "while parsing %s:\n%s" what (e.Error.msg ()) + in + err { e with Error.msg })); + } + +let empty = { run = (fun st ~ok ~err:_ -> ok st ()) } +let nop = empty +let any_char = { run = (fun st ~ok ~err -> consume_ st ~ok ~err) } + +let char c : _ t = + { + run = + (fun st ~ok ~err -> + consume_ st + ~ok:(fun st c2 -> + if char_equal c c2 then + ok st c + else ( + let msg () = Printf.sprintf "expected '%c', got '%c'" c c2 in + err (mk_error_ st msg) + )) + ~err); + } + +let char_if ?descr p = + { + run = + (fun st ~ok ~err -> + consume_ st + ~ok:(fun st c -> + if p c then + ok st c + else ( + let msg () = + let rest = + match descr with + | None -> "" + | Some d -> Printf.sprintf ", expected %s" d + in + Printf.sprintf "unexpected char '%c'%s" c rest + in + err (mk_error_ st msg) + )) + ~err); + } + +let take_if p : slice t = + { + run = + (fun st ~ok ~err:_ -> + let i = ref st.i in + while + let st = { st with i = !i } in + (not (is_done st)) && p (cur st) + do + incr i + done; + ok { st with i = !i } { st with j = !i }); + } + +let take1_if ?descr p = + take_if p >>= fun sl -> + if Slice.is_empty sl then ( + let msg () = + let what = + match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + fail_lazy msg + ) else + return sl + +let chars_if p = take_if p >|= Slice.to_string + +let chars1_if ?descr p = + { + run = + (fun st ~ok ~err -> + (chars_if p).run st + ~ok:(fun st s -> + if string_equal s "" then ( + let msg () = + let what = + match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + err (mk_error_ st msg) + ) else + ok st s) + ~err); + } + +let guard ?descr f p = + { + run = + (fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + if f x then + ok st x + else ( + let msg () = + match descr with + | None -> "guard failed" + | Some s -> Printf.sprintf "guard failed: %s" s + in + err (mk_error_ st msg) + )) + ~err); + } + +exception Fold_fail of state * string + +let chars_fold ~f acc0 = + { + run = + (fun st ~ok ~err -> + let i0 = st.i in + let i = ref i0 in + let acc = ref acc0 in + let continue = ref true in + try + while !continue do + let st = { st with i = !i } in + if is_done st then + continue := false + else ( + let c = cur st in + match f !acc c with + | `Continue acc' -> + incr i; + acc := acc' + | `Stop a -> + acc := a; + continue := false + | `Consume_and_stop a -> + acc := a; + incr i; + continue := false + | `Fail msg -> raise (Fold_fail (st, msg)) + ) + done; + ok { st with i = !i } (!acc, { st with j = !i }) + with Fold_fail (st, msg) -> err (mk_error_ st (const_str_ msg))); + } + +let chars_fold_transduce ~f acc0 = + { + run = + (fun st ~ok ~err -> + let i0 = st.i in + let i = ref i0 in + let acc = ref acc0 in + let continue = ref true in + let buf = Buffer.create 16 in + try + while !continue do + let st = { st with i = !i } in + if is_done st then + continue := false + else ( + let c = cur st in + match f !acc c with + | `Continue acc' -> + incr i; + acc := acc' + | `Yield (acc', c') -> + incr i; + acc := acc'; + Buffer.add_char buf c' + | `Stop -> continue := false + | `Consume_and_stop -> + incr i; + continue := false + | `Fail msg -> raise (Fold_fail (st, msg)) + ) + done; + ok { st with i = !i } (!acc, Buffer.contents buf) + with Fold_fail (st, msg) -> err (mk_error_ st (const_str_ msg))); + } + +let skip_chars p : _ t = + let rec self = + { + run = + (fun st ~ok ~err -> + if (not (is_done st)) && p (cur st) then ( + let st = junk_ st in + self.run st ~ok ~err + ) else + ok st ()); + } + in + self + +let is_alpha = function + | 'a' .. 'z' | 'A' .. 'Z' -> true + | _ -> false + +let is_num = function + | '0' .. '9' -> true + | _ -> false + +let is_alpha_num = function + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true + | _ -> false + +let is_space = function + | ' ' | '\t' -> true + | _ -> false + +let is_white = function + | ' ' | '\t' | '\n' -> true + | _ -> false + +let space = char_if is_space +let white = char_if is_white + +let endline = + char_if ~descr:"end-of-line ('\\n')" (function + | '\n' -> true + | _ -> false) + +let skip_space = skip_chars is_space +let skip_white = skip_chars is_white + +let try_or p1 ~f ~else_:p2 = + { + run = + (fun st ~ok ~err -> + p1.run st + ~ok:(fun st x -> (f x).run st ~ok ~err) + ~err:(fun _ -> p2.run st ~ok ~err)); + } + +let try_or_l ?(msg = "try_or_l ran out of options") ?else_ l : _ t = + { + run = + (fun st ~ok ~err -> + let rec loop = function + | (test, p) :: tl -> + test.run st + ~ok:(fun _ _ -> p.run st ~ok ~err) (* commit *) + ~err:(fun _ -> loop tl) + | [] -> + (match else_ with + | None -> err (mk_error_ st (const_str_ msg)) + | Some p -> p.run st ~ok ~err) + in + loop l); + } + +let suspend f = + { + run = + (fun st ~ok ~err -> + let p = f () in + p.run st ~ok ~err); + } + +(* read [len] chars at once *) +let take len : slice t = + { + run = + (fun st ~ok ~err -> + if st.i + len <= st.j then ( + let slice = { st with j = st.i + len } in + let st = { st with i = st.i + len } in + ok st slice + ) else ( + let msg () = + Printf.sprintf "expected to be able to consume %d chars" len + in + err (mk_error_ st msg) + )); + } + +let any_char_n len : _ t = take len >|= Slice.to_string + +let exact s = + { + run = + (fun st ~ok ~err -> + (* parse a string of length [String.length s] and compare with [s] *) + (any_char_n (String.length s)).run st + ~ok:(fun st s2 -> + if string_equal s s2 then + ok st s + else ( + let msg () = Printf.sprintf "expected %S, got %S" s s2 in + err (mk_error_ st msg) + )) + ~err); + } + +let string = exact + +let fix f = + let rec self = + { run = (fun st ~ok ~err -> (Lazy.force f_self).run st ~ok ~err) } + and f_self = lazy (f self) in + self + +let try_ p = p + +let try_opt p : _ t = + { + run = + (fun st ~ok ~err:_ -> + p.run st ~ok:(fun st x -> ok st (Some x)) ~err:(fun _ -> ok st None)); + } + +let optional p : _ t = + { + run = + (fun st ~ok ~err:_ -> + p.run st ~ok:(fun st _x -> ok st ()) ~err:(fun _ -> ok st ())); + } + +let many_until ~until p : _ t = + fix (fun self -> + try_or until + ~f:(fun _ -> pure []) + ~else_: + ( p >>= fun x -> + self >|= fun l -> x :: l )) + +let many p : _ t = + fix (fun self -> + try_or p ~f:(fun x -> self >|= fun tl -> x :: tl) ~else_:(pure [])) + +let many1 p = + p >>= fun x -> + many p >|= fun l -> x :: l + +(* skip can be made efficient by not allocating intermediate parsers *) +let skip p : _ t = + let rec self = + { + run = + (fun st ~ok ~err -> + p.run st + ~ok:(fun st _ -> self.run st ~ok ~err) + ~err:(fun _ -> ok st ())); + } + in + self + +let sep_until ~until ~by p = + let rec read_p = + lazy + ( p >>= fun x -> + until *> pure [ x ] <|> by *> (Lazy.force read_p >|= fun tl -> x :: tl) + ) + in + until *> pure [] <|> Lazy.force read_p + +let sep ~by p = + let rec read_p = + lazy + (try_or p + ~f:(fun x -> + eoi *> pure [ x ] + <|> try_or by + ~f:(fun _ -> Lazy.force read_p >|= fun tl -> x :: tl) + ~else_:(pure [ x ])) + ~else_:(pure [])) + in + Lazy.force read_p + +let sep1 ~by p = + p >>= fun x -> + sep ~by p >|= fun tl -> x :: tl + +let lookahead p : _ t = + { + run = + (fun st ~ok ~err -> + p.run st ~ok:(fun _st x -> ok st x) (* discard p's new state *) ~err); + } + +let lookahead_ignore p : _ t = + { run = (fun st ~ok ~err -> p.run st ~ok:(fun _st _x -> ok st ()) ~err) } + +let set_current_slice sl : _ t = + { + run = + (fun _st ~ok ~err:_ -> + assert (CCShims_.Stdlib.(_st.cs == sl.cs)); + ok sl ()) + (* jump to slice *); + } + +let split_1 ~on_char : _ t = + { + run = + (fun st ~ok ~err:_ -> + if st.i >= st.j then + ok st (st, None) + else ( + match String.index_from st.cs.str st.i on_char with + | j -> + let x = { st with j } in + let y = { st with i = min st.j (j + 1) } in + let st_done = { st with i = st.j } in + (* empty *) + ok st_done (x, Some y) + | exception Not_found -> + let st_done = { st with i = st.j } in + (* empty *) + ok st_done (st, None) + )); + } + +let split_list_at_most ~on_char n : slice list t = + let rec loop acc n = + if n <= 0 then + (* add the rest to [acc] *) + all >|= fun rest -> + let acc = rest :: acc in + List.rev acc + else + try_or eoi ~f:(fun _ -> return (List.rev acc)) ~else_:(parse_1 acc n) + and parse_1 acc n = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc (n - 1)) + in + loop [] n + +let split_list ~on_char : _ t = split_list_at_most ~on_char max_int + +let split_2 ~on_char : _ t = + split_list_at_most ~on_char 3 >>= function + | [ a; b ] -> return (a, b) + | _ -> fail "split_2: expected 2 fields exactly" + +let split_3 ~on_char : _ t = + split_list_at_most ~on_char 4 >>= function + | [ a; b; c ] -> return (a, b, c) + | _ -> fail "split_3: expected 3 fields exactly" + +let split_4 ~on_char : _ t = + split_list_at_most ~on_char 5 >>= function + | [ a; b; c; d ] -> return (a, b, c, d) + | _ -> fail "split_4: expected 4 fields exactly" + +let split_list ~on_char : slice list t = + let rec loop acc = + try_or eoi ~f:(fun _ -> return (List.rev acc)) ~else_:(parse_1 acc) + and parse_1 acc = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let each_split ~on_char p : 'a list t = + let rec loop acc = + split_1 ~on_char >>= fun (sl1, rest) -> + (* parse [sl1] with [p] *) + recurse sl1 p >>= fun x -> + let acc = x :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let line : slice t = + split_1 ~on_char:'\n' >>= fun (sl, rest) -> + match rest with + | None -> return sl + | Some rest -> set_current_slice rest >|= fun () -> sl + +let line_str = line >|= Slice.to_string +let each_line p : _ t = each_split ~on_char:'\n' p + +let memo (type a) (p : a t) : a t = + let id = !Memo_state.id_ in + incr Memo_state.id_; + let r = ref None in + + (* used for universal encoding *) + { + run = + (fun st ~ok ~err -> + let tbl = + match st.cs.memo with + | Some t -> t + | None -> + let tbl = Memo_tbl.create 32 in + st.cs.memo <- Some tbl; + tbl + in + + match + r := None; + let f = Memo_tbl.find tbl (st.i, id) in + f (); + !r + with + | None -> assert false + | Some (Ok (st, x)) -> ok st x + | Some (Error e) -> err e + | exception Not_found -> + (* parse, and save *) + p.run st + ~ok:(fun st' x -> + Memo_tbl.replace tbl (st.i, id) (fun () -> + r := Some (Ok (st', x))); + ok st' x) + ~err:(fun e -> + Memo_tbl.replace tbl (st.i, id) (fun () -> r := Some (Error e)); + err e)); + } + +let fix_memo f = + let rec p = { run = (fun st ~ok ~err -> (Lazy.force p').run st ~ok ~err) } + and p' = lazy (memo (f p)) in + p + +exception ParseError of Error.t + +let stringify_result = function + | Ok _ as x -> x + | Error e -> Error (Error.to_string e) + +let parse_string_exn p s = + p.run (state_of_string s) + ~ok:(fun _st x -> x) + ~err:(fun e -> raise (ParseError e)) + +let parse_string_e p s = + p.run (state_of_string s) ~ok:(fun _st x -> Ok x) ~err:(fun e -> Error e) + +let parse_string p s = parse_string_e p s |> stringify_result + +let read_all_ ic = + let buf = Buffer.create 1024 in + (try + while true do + let line = input_line ic in + Buffer.add_string buf line; + Buffer.add_char buf '\n' + done; + assert false + with End_of_file -> ()); + Buffer.contents buf + +let parse_file_e p file = + let ic = open_in file in + let s = read_all_ ic in + let r = parse_string_e p s in + close_in ic; + r + +let parse_file p file = parse_file_e p file |> stringify_result + +let parse_file_exn p file = + match parse_file_e p file with + | Ok x -> x + | Error e -> raise (ParseError e) + +module U = struct + let sep_ = sep + + let list ?(start = "[") ?(stop = "]") ?(sep = ";") p = + string start *> skip_white + *> sep_until + ~until:(skip_white <* string stop) + ~by:(skip_white *> string sep *> skip_white) + p + + let int = + skip_white + *> chars1_if ~descr:"integer" (fun c -> is_num c || char_equal c '-') + >>= fun s -> + try return (int_of_string s) with Failure _ -> fail "expected an int" + + let in_paren (p : 'a t) : 'a t = + skip_white *> (char '(' *> skip_white *> p <* skip_white <* char ')') + + let in_parens_opt (p : 'a t) : 'a t = + fix (fun self -> + skip_white + *> try_or (char '(') + ~f:(fun _ -> skip_white *> self <* skip_white <* char ')') + ~else_:p) + + let option p = + skip_white + *> try_or (string "Some") + ~f:(fun _ -> skip_white *> p >|= fun x -> Some x) + ~else_:(string "None" *> return None) + + let hexa_int = + (exact "0x" <|> return "") + *> ( chars1_if (function + | '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' -> true + | _ -> false) + >|= fun s -> + let i = ref 0 in + String.iter + (fun c -> + let n = + match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | _ -> assert false + in + i := (!i * 16) + n) + s; + !i ) + + let prepend_str c s = String.make 1 c ^ s + let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) + + let bool = + skip_white + *> (string "true" *> return true <|> string "false" *> return false) + + let pair ?(start = "(") ?(stop = ")") ?(sep = ",") p1 p2 = + skip_white *> string start *> skip_white *> p1 >>= fun x1 -> + skip_white *> string sep *> skip_white *> p2 >>= fun x2 -> + skip_white *> string stop *> return (x1, x2) + + let triple ?(start = "(") ?(stop = ")") ?(sep = ",") p1 p2 p3 = + string start *> skip_white *> p1 >>= fun x1 -> + skip_white *> string sep *> skip_white *> p2 >>= fun x2 -> + skip_white *> string sep *> skip_white *> p3 >>= fun x3 -> + string stop *> return (x1, x2, x3) +end + +module Debug_ = struct + let trace_fail name p = + { + run = + (fun st ~ok ~err -> + p.run st ~ok ~err:(fun e -> + Printf.eprintf "trace %s: fail with %s\n%!" name + (Error.to_string e); + err e)); + } + + let trace_ ~both name ~print p = + { + run = + (fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + Printf.eprintf "trace %s: parsed %s\n%!" name (print x); + ok st x) + ~err:(fun e -> + if both then + Printf.eprintf "trace %s: fail with %s\n%!" name + (Error.to_string e); + err e)); + } + + let trace_success name ~print p = trace_ ~both:false name ~print p + let trace_success_or_fail name ~print p = trace_ ~both:true name ~print p +end diff --git a/src/parser/parser_comb.mli b/src/parser/parser_comb.mli new file mode 100644 index 00000000..11b9fa29 --- /dev/null +++ b/src/parser/parser_comb.mli @@ -0,0 +1,695 @@ +(** Simple Parser Combinators + +*) + +type position +(** A position in the input. Typically it'll point at the {b beginning} of + an error location. *) + +(** {2 Positions in input} + + *) +module Position : sig + type t = position + + val line : t -> int + (** Line number, 0 based *) + + val column : t -> int + (** Column number, 0 based *) + + val line_and_column : t -> int * int + (** Line and column number *) + + val min : t -> t -> t + val max : t -> t -> t + + val pp : Format.formatter -> t -> unit + (** Unspecified pretty-printed version of the position. *) +end + +(** {2 Errors} + *) +module Error : sig + type t + (** A parse error. + *) + + val position : t -> position + (** Returns position of the error *) + + val line_and_column : t -> int * int + (** Line and column numbers of the error position. *) + + val msg : t -> string + + val to_string : t -> string + (** Prints the error *) + + val pp : Format.formatter -> t -> unit + (** Pretty prints the error *) +end + +type +'a or_error = ('a, Error.t) result +(** ['a or_error] is either [Ok x] for some result [x : 'a], + or an error {!Error.t}. + + See {!stringify_result} and {!Error.to_string} to print the + error message. *) + +exception ParseError of Error.t + +(** {2 Input} *) + +(** {2 Combinators} *) + +type 'a t +(** The abstract type of parsers that return a value of type ['a] (or fail). + + @raise ParseError in case of failure. + the type is private. +*) + +val return : 'a -> 'a t +(** Always succeeds, without consuming its input. *) + +val pure : 'a -> 'a t +(** Synonym to {!return}. *) + +val map : ('a -> 'b) -> 'a t -> 'b t +val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t +val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t + +val bind : ('a -> 'b t) -> 'a t -> 'b t +(** [bind f p] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. +*) + +val ap : ('a -> 'b) t -> 'a t -> 'b t +(** Applicative. + *) + +val eoi : unit t +(** Expect the end of input, fails otherwise. *) + +val empty : unit t +(** Succeed with [()]. + *) + +val fail : string -> 'a t +(** [fail msg] fails with the given message. It can trigger a backtrack. *) + +val failf : ('a, unit, string, 'b t) format4 -> 'a +(** [Format.sprintf] version of {!fail}. *) + +val fail_lazy : (unit -> string) -> 'a t +(** Like {!fail}, but only produce an error message on demand. + *) + +val parsing : string -> 'a t -> 'a t +(** [parsing s p] behaves the same as [p], with the information that + we are parsing [s], if [p] fails. + The message [s] is added to the error, it does not replace it, + not does the location change (the error still points to + the same location as in [p]). *) + +val set_error_message : string -> 'a t -> 'a t +(** [set_error_message msg p] behaves like [p], but if [p] fails, + [set_error_message msg p] fails with [msg] instead and at the current + position. The internal error message of [p] is just discarded. + *) + +val pos : position t +(** [pos] returns the current position in the buffer. + + *) + +val with_pos : 'a t -> ('a * position) t +(** [with_pos p] behaves like [p], but returns the (starting) position + along with [p]'s result. + + *) + +val any_char : char t +(** [any_char] parses any character. + It still fails if the end of input was reached. + *) + +val any_char_n : int -> string t +(** [any_char_n len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. + *) + +val char : char -> char t +(** [char c] parses the character [c] and nothing else. *) + +type slice +(** A slice of the input, as returned by some combinators such + as {!split_1} or {!split_list} or {!take}. + + The idea is that one can use some parsers to cut the input into slices, + e.g. split into lines, or split a line into fields (think CSV or TSV). + Then a variety of parsers can be used on each slice to extract data from + it using {!recurse}. + + Slices contain enough information to make it possible + for [recurse slice p] to report failures (if [p] fails) using locations + from the original input, not relative to the slice. + Therefore, even after splitting the input into lines using, say, {!each_line}, + a failure to parse the 500th line will be reported at line 500 and + not at line 1. + + *) + +(** Functions on slices. + *) +module Slice : sig + type t = slice + + val is_empty : t -> bool + (** Is the slice empty? *) + + val length : t -> int + (** Length of the slice *) + + val to_string : t -> string + (** Convert the slice into a string. + Linear time and memory in [length slice] *) +end + +val recurse : slice -> 'a t -> 'a t +(** [recurse slice p] parses the [slice] + (most likely obtained via another combinator, such as {!split_1} + or {!split_n}), using [p]. + + The slice contains a position which is used to relocate error + messages to their position in the whole input, not just relative to + the slice. + + *) + +val set_current_slice : slice -> unit t +(** [set_current_slice slice] replaces the parser's state with [slice]. + + *) + +val chars_fold : + f: + ('acc -> + char -> + [ `Continue of 'acc + | `Consume_and_stop of 'acc + | `Stop of 'acc + | `Fail of string ]) -> + 'acc -> + ('acc * slice) t +(** [chars_fold f acc0] folds over characters of the input. + Each char [c] is passed, along with the current accumulator, to [f]; + [f] can either: + + - stop, by returning [`Stop acc]. In this case the final accumulator [acc] + is returned, and [c] is not consumed. + - consume char and stop, by returning [`Consume_and_stop acc]. + - fail, by returning [`Fail msg]. In this case the parser fails + with the given message. + - continue, by returning [`Continue acc]. The parser continues to the + next char with the new accumulator. + + This is a generalization of of {!chars_if} that allows one to transform + characters on the fly, skip some, handle escape sequences, etc. + It can also be useful as a base component for a lexer. + + @return a pair of the final accumular, and the slice matched by the fold. + *) + +val chars_fold_transduce : + f: + ('acc -> + char -> + [ `Continue of 'acc + | `Yield of 'acc * char + | `Consume_and_stop + | `Stop + | `Fail of string ]) -> + 'acc -> + ('acc * string) t +(** Same as {!char_fold} but with the following differences: + + - returns a string along with the accumulator, rather than the slice + of all the characters accepted by [`Continue _]. + The string is built from characters returned by [`Yield]. + - new case [`Yield (acc, c)] adds [c] to the returned string + and continues parsing with [acc]. + + *) + +val guard : ?descr:string -> ('a -> bool) -> 'a t -> 'a t +(** [guard f p] is like [p], but fails if the value returned by [p] + does not satisfy [f]. + @param descr used to provide a better error message *) + +val take : int -> slice t +(** [take len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. + *) + +val take_if : (char -> bool) -> slice t +(** [take_if f] takes characters as long as they satisfy the predicate [f]. + *) + +val take1_if : ?descr:string -> (char -> bool) -> slice t +(** [take1_if f] takes characters as long as they satisfy the predicate [f]. + Fails if no character satisfies [f]. + @param descr describes what kind of character was expected, in case of error + *) + +val char_if : ?descr:string -> (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true]. + Fails if the next char does not satisfy [f]. + @param descr describes what kind of character was expected, in case of error *) + +val chars_if : (char -> bool) -> string t +(** [chars_if f] parses a string of chars that satisfy [f]. + Cannot fail. *) + +val chars1_if : ?descr:string -> (char -> bool) -> string t +(** Like {!chars_if}, but accepts only non-empty strings. + [chars1_if p] fails if the string accepted by [chars_if p] is empty. + [chars1_if p] is equivalent to [take1_if p >|= Slice.to_string]. + @param descr describes what kind of character was expected, in case of error *) + +val endline : char t +(** Parse '\n'. *) + +val space : char t +(** Tab or space. *) + +val white : char t +(** Tab or space or newline. *) + +val skip_chars : (char -> bool) -> unit t +(** Skip 0 or more chars satisfying the predicate. *) + +val skip_space : unit t +(** Skip ' ' and '\t'. *) + +val skip_white : unit t +(** Skip ' ' and '\t' and '\n'. *) + +val is_alpha : char -> bool +(** Is the char a letter? *) + +val is_num : char -> bool +(** Is the char a digit? *) + +val is_alpha_num : char -> bool +(** Is the char a letter or a digit? *) + +val is_space : char -> bool +(** True on ' ' and '\t'. *) + +val is_white : char -> bool +(** True on ' ' and '\t' and '\n'. *) + +val suspend : (unit -> 'a t) -> 'a t +(** [suspend f] is the same as [f ()], but evaluates [f ()] only + when needed. + + A practical use case is to implement recursive parsers manually, + as described in {!fix}. The parser is [let rec p () = …], + and [suspend p] can be used in the definition to use [p]. +*) + +val string : string -> string t +(** [string s] parses exactly the string [s], and nothing else. *) + +val exact : string -> string t +(** Alias to {!string}. + *) + +val many : 'a t -> 'a list t +(** [many p] parses [p] repeatedly, until [p] fails, and + collects the results into a list. *) + +val optional : _ t -> unit t +(** [optional p] tries to parse [p], and return [()] whether it + succeeded or failed. Cannot fail itself. + It consumes input if [p] succeeded (as much as [p] consumed), but + consumes not input if [p] failed. + *) + +val try_ : 'a t -> 'a t + [@@deprecated "plays no role anymore, just replace [try foo] with [foo]"] +(** [try_ p] is just like [p] (it used to play a role in backtracking + semantics but no more). + + @deprecated since 3.6 it can just be removed. See {!try_opt} if you want + to detect failure. *) + +val try_opt : 'a t -> 'a option t +(** [try_opt p] tries to parse using [p], and return [Some x] if [p] + succeeded with [x] (and consumes what [p] consumed). + Otherwise it returns [None] and consumes nothing. This cannot fail. + *) + +val many_until : until:_ t -> 'a t -> 'a list t +(** [many_until ~until p] parses as many [p] as it can until + the [until] parser successfully returns. + If [p] fails before that then [many_until ~until p] fails as well. + Typically [until] can be a closing ')' or another termination condition, + and what is consumed by [until] is also consumed by [many_until ~until p]. + + *) + +val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t +(** [try_or p1 ~f ~else_:p2] attempts to parse [x] using [p1], + and then becomes [f x]. + If [p1] fails, then it becomes [p2]. This can be useful if [f] is expensive + but only ever works if [p1] matches (e.g. after an opening parenthesis + or some sort of prefix). + +*) + +val try_or_l : ?msg:string -> ?else_:'a t -> (unit t * 'a t) list -> 'a t +(** [try_or_l ?else_ l] tries each pair [(test, p)] in order. + If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p], + whether [p] fails or not. If [test] consumes input, the state is restored + before calling [p]. + If they all fail, and [else_] is defined, then it behaves like [else_]. + If all fail, and [else_] is [None], then it fails as well. + + This is a performance optimization compared to {!(<|>)}. We commit to a + branch if the test succeeds, without backtracking at all. + It can also provide better error messages, because failures in the parser + will not be reported as failures in [try_or_l]. + + See {!lookahead_ignore} for a convenient way of writing the test conditions. + + @param msg error message if all options fail + + *) + +val or_ : 'a t -> 'a t -> 'a t +(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] + from the same position. + *) + +val both : 'a t -> 'b t -> ('a * 'b) t +(** [both a b] parses [a], then [b], then returns the pair of their results. + *) + +val many1 : 'a t -> 'a list t +(** [many1 p] is like [many p] excepts it fails if the + list is empty (i.e. it needs [p] to succeed at least once). *) + +val skip : _ t -> unit t +(** [skip p] parses zero or more times [p] and ignores its result. + It is eager, meaning it will continue as long as [p] succeeds. + As soon as [p] fails, [skip p] stops consuming any input. *) + +val sep : by:_ t -> 'a t -> 'a list t +(** [sep ~by p] parses a list of [p] separated by [by]. *) + +val sep_until : until:_ t -> by:_ t -> 'a t -> 'a list t +(** Same as {!sep} but stop when [until] parses successfully. + *) + +val sep1 : by:_ t -> 'a t -> 'a list t +(** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) + +val lookahead : 'a t -> 'a t +(** [lookahead p] behaves like [p], except it doesn't consume any input. + *) + +val lookahead_ignore : 'a t -> unit t +(** [lookahead_ignore p] tries to parse input with [p], + and succeeds if [p] succeeds. However it doesn't consume any input + and returns [()], so in effect its only use-case is to detect + whether [p] succeeds, e.g. in {!try_or_l}. + + *) + +val fix : ('a t -> 'a t) -> 'a t +(** Fixpoint combinator. [fix (fun self -> p)] is the parser [p], + in which [self] refers to the parser [p] itself (which is useful to + parse recursive structures. + + An alternative, manual implementation to [let p = fix (fun self -> q)] + is: + {[ let rec p () = + let self = suspend p in + q + ]} +*) + +val line : slice t +(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n']. + *) + +val line_str : string t +(** [line_str] is [line >|= Slice.to_string]. + It parses the next line and turns the slice into a string. + The state points to the character immediately after the ['\n'] character. + *) + +val each_line : 'a t -> 'a list t +(** [each_line p] runs [p] on each line of the input. + *) + +val split_1 : on_char:char -> (slice * slice option) t +(** [split_1 ~on_char] looks for [on_char] in the input, and returns a + pair [sl1, sl2], where: + + - [sl1] is the slice of the input the precedes the first occurrence + of [on_char], or the whole input if [on_char] cannot be found. + It does not contain [on_char]. + - [sl2] is the slice that comes after [on_char], + or [None] if [on_char] couldn't be found. It doesn't contain the first + occurrence of [on_char] (if any). + + The parser is now positioned at the end of the input. + *) + +val split_list : on_char:char -> slice list t +(** [split_list ~on_char] splits the input on all occurrences of [on_char], + returning a list of slices. + *) + +val split_list_at_most : on_char:char -> int -> slice list t +(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most + [n] times, to get a list of [n+1] elements. + The last element might contain [on_char]. This is useful to limit the + amount of work done by {!split_list}. + *) + +val split_2 : on_char:char -> (slice * slice) t +(** [split_2 ~on_char] splits the input into exactly 2 fields, + and fails if the split yields less or more than 2 items. + *) + +val split_3 : on_char:char -> (slice * slice * slice) t +(** See {!split_2} + *) + +val split_4 : on_char:char -> (slice * slice * slice * slice) t +(** See {!split_2} + *) + +val each_split : on_char:char -> 'a t -> 'a list t +(** [split_list_map ~on_char p] uses [split_list ~on_char] to split + the input, then parses each chunk of the input thus obtained using [p]. + + The difference with [sep ~by:(char on_char) p] is that + [sep] calls [p] first, and only tries to find [on_char] after [p] returns. + While it is more flexible, this technique also means [p] has to be careful + not to consume [on_char] by error. + + A useful specialization of this is {!each_line}, which is + basically [each_split ~on_char:'\n' p]. + *) + +val all : slice t +(** [all] returns all the unconsumed input as a slice, and consumes it. + Use {!Slice.to_string} to turn it into a string. + + Note that [lookahead all] can be used to {i peek} at the rest of the input + without consuming anything. + + *) + +val all_str : string t +(** [all_str] accepts all the remaining chars and extracts them into a + string. Similar to {!all} but with a string. + *) + +val memo : 'a t -> 'a t +(** Memoize the parser. [memo p] will behave like [p], but when called + in a state (read: position in input) it has already processed, [memo p] + returns a result directly. The implementation uses an underlying + hashtable. + This can be costly in memory, but improve the run time a lot if there + is a lot of backtracking involving [p]. + + Do not call {!memo} inside other functions, especially with {!(>>=)}, + {!map}, etc. being so prevalent. Instead the correct way to use it + is in a toplevel definition: + + {[ + let my_expensive_parser = memo (foo *> bar >>= fun i -> …) + ]} + + This function is not thread-safe. *) + +val fix_memo : ('a t -> 'a t) -> 'a t +(** Like {!fix}, but the fixpoint is memoized. *) + +(** {2 Infix} *) + +module Infix : sig + val ( >|= ) : 'a t -> ('a -> 'b) -> 'b t + (** Alias to {!map}. [p >|= f] parses an item [x] using [p], + and returns [f x]. *) + + val ( >>= ) : 'a t -> ('a -> 'b t) -> 'b t + (** Alias to {!bind}. + [p >>= f] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. *) + + val ( <*> ) : ('a -> 'b) t -> 'a t -> 'b t + (** Applicative. *) + + val ( <* ) : 'a t -> _ t -> 'a t + (** [a <* b] parses [a] into [x], parses [b] and ignores its result, + and returns [x]. *) + + val ( *> ) : _ t -> 'a t -> 'a t + (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The + result of [a] is ignored. *) + + val ( <|> ) : 'a t -> 'a t -> 'a t + (** Alias to {!or_}. + + [a <|> b] tries to parse [a], and if [a] fails without + consuming any input, backtracks and tries + to parse [b], otherwise it fails as [a]. *) + + val ( ) : 'a t -> string -> 'a t + (** [a msg] behaves like [a], but if [a] fails, + [a msg] fails with [msg] instead. + Useful as the last choice in a series of [<|>]. For example: + [a <|> b <|> c "expected one of a, b, c"]. *) + + val ( ||| ) : 'a t -> 'b t -> ('a * 'b) t + (** Alias to {!both}. + [a ||| b] parses [a], then [b], then returns the pair of their results. + *) + + val ( let+ ) : 'a t -> ('a -> 'b) -> 'b t + val ( let* ) : 'a t -> ('a -> 'b t) -> 'b t + val ( and+ ) : 'a t -> 'b t -> ('a * 'b) t + val ( and* ) : 'a t -> 'b t -> ('a * 'b) t +end + +include module type of Infix + +(** {2 Parse input} *) + +val stringify_result : 'a or_error -> ('a, string) result +(** Turn a {!Error.t}-oriented result into a more basic string result. + *) + +val parse_string : 'a t -> string -> ('a, string) result +(** Parse a string using the parser. *) + +val parse_string_e : 'a t -> string -> 'a or_error +(** Version of {!parse_string} that returns a more detailed error. *) + +val parse_string_exn : 'a t -> string -> 'a +(** @raise ParseError if it fails. *) + +val parse_file : 'a t -> string -> ('a, string) result +(** [parse_file p filename] parses file named [filename] with [p] + by opening the file and reading it whole. *) + +val parse_file_e : 'a t -> string -> 'a or_error +(** Version of {!parse_file} that returns a more detailed error. *) + +val parse_file_exn : 'a t -> string -> 'a +(** Same as {!parse_file}, but + @raise ParseError if it fails. *) + +(** {2 Utils} + + This is useful to parse OCaml-like values in a simple way. + All the parsers are whitespace-insensitive (they skip whitespace). *) +module U : sig + val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t + (** [list p] parses a list of [p], with the OCaml conventions for + start token "\[", stop token "\]" and separator ";". + Whitespace between items are skipped. *) + + (* TODO: parse option? *) + (* TODO: split on whitespace? *) + + val int : int t + (** Parse an int in decimal representation. *) + + val in_paren : 'a t -> 'a t + (** [in_paren p] parses an opening "(",[p] , and then ")". + *) + + val in_parens_opt : 'a t -> 'a t + (** [in_parens_opt p] parses [p] in an arbitrary number of nested + parenthesis (possibly 0). + *) + + val option : 'a t -> 'a option t + (** [option p] parses "Some " into [Some x] if [p] parses "" into [x], + and parses "None" into [None]. + *) + + val hexa_int : int t + (** Parse an int int hexadecimal format. Accepts an optional [0x] prefix, + and ignores capitalization. + *) + + val word : string t + (** Non empty string of alpha num, start with alpha. *) + + val bool : bool t + (** Accepts "true" or "false" + *) + + (* TODO: quoted string *) + + val pair : + ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> ('a * 'b) t + (** Parse a pair using OCaml syntactic conventions. + The default is "(a, b)". *) + + val triple : + ?start:string -> + ?stop:string -> + ?sep:string -> + 'a t -> + 'b t -> + 'c t -> + ('a * 'b * 'c) t + (** Parse a triple using OCaml syntactic conventions. + The default is "(a, b, c)". *) +end + +(** Debugging utils. *) +module Debug_ : sig + val trace_fail : string -> 'a t -> 'a t + (** [trace_fail name p] behaves like [p], but prints the error message of [p] + on stderr whenever [p] fails. + @param name used as a prefix of all trace messages. *) + + val trace_success : string -> print:('a -> string) -> 'a t -> 'a t + (** [trace_success name ~print p] behaves like [p], but + prints successful runs of [p] using [print]. *) + + val trace_success_or_fail : string -> print:('a -> string) -> 'a t -> 'a t + (** Trace both error or success *) +end diff --git a/src/parser/sidekick_parser.ml b/src/parser/sidekick_parser.ml new file mode 100644 index 00000000..150792e9 --- /dev/null +++ b/src/parser/sidekick_parser.ml @@ -0,0 +1,3 @@ +module Ast_term = Ast_term +module Parser_comb = Parser_comb +module Parse_term = Parse_term