From 37af485971cc0b5e39780b405940e42f11aa01e5 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 15:08:13 -0400 Subject: [PATCH] parse: expose Position module, add `or_`, `both`, `lookahead`, `U.bool` --- src/core/CCParse.ml | 174 +++++++++++++++++++++------------ src/core/CCParse.mli | 224 ++++++++++++++++++++++++++----------------- 2 files changed, 248 insertions(+), 150 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index d26d86b3..22e6af88 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -107,37 +107,64 @@ open CCShims_ () *) -module Error = struct - type t = { - msg: unit -> string; - str: string; - offset: int; (* offset in [e_str] *) - } +(* TODO: [type position = {state: state; i: int}] and recompute line, col + on demand *) +type position = { + pos_buffer: string; + pos_offset: int; + mutable pos_lc: (int * int) option; +} - let get_loc_ (self:t) : int * int = +module Position = struct + type t = position + + (* actually re-compute line and column from the buffer *) + let compute_line_and_col_ (self:t) : int * int = let i = ref 0 in let continue = ref true in let line = ref 1 in let col = ref 1 in - while !continue && !i < self.offset do - match String.index_from self.str !i '\n' with + while !continue && !i < self.pos_offset do + match String.index_from self.pos_buffer !i '\n' with | exception Not_found -> - col := self.offset - !i; continue := false; - | j when j > self.offset -> - col := self.offset - !i; continue := false; + col := self.pos_offset - !i; continue := false; + | j when j > self.pos_offset -> + col := self.pos_offset - !i; continue := false; | j -> incr line; i := j+1; done; !line, !col - let line_and_column self = get_loc_ self + let line_and_column self = + match self.pos_lc with + | Some tup -> tup + | None -> + let tup = compute_line_and_col_ self in + self.pos_lc <- Some tup; (* save *) + tup + + let line self = fst (line_and_column self) + let column self = snd (line_and_column self) + let pp out self = + let l, c = line_and_column self in + Format.fprintf out "at line %d, column %d" l c +end + +module Error = struct + type t = { + msg: unit -> string; + pos: position; + } + + let position self = self.pos + let line_and_column self = Position.line_and_column self.pos let msg self = self.msg() let to_string self = - let line,col = get_loc_ self in + let line,col = line_and_column self in Printf.sprintf "at line %d, char %d:\n%s" line col (self.msg()) let pp out self = - let line,col = get_loc_ self in + let line,col = line_and_column self in Format.fprintf out "at line %d, char %d:@ %s" line col (self.msg()) end @@ -157,10 +184,6 @@ module Memo_state = struct let id_ = ref 0 end -(* TODO: [type position = {state: state; i: int}] and recompute line, col - on demand *) -type position = int * int * int (* pos, line, column *) - (** Purely functional state passed around *) type state = { str: string; (* the input *) @@ -192,8 +215,8 @@ let state_of_string str = let[@inline] is_done st = st.i >= String.length st.str let[@inline] cur st = st.str.[st.i] -let mk_error_ st msg : Error.t = - {Error.msg; str=st.str; offset=st.i} +let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None} +let mk_error_ st msg : Error.t = {Error.msg; pos=pos_of_st_ st} (* consume one char, passing it to [ok]. *) let consume_ st ~ok ~err = @@ -221,14 +244,14 @@ let return x : _ t = { let pure = return -let (>|=) (p: 'a t) f : _ t = { +let map f (p: 'a t) : _ t = { run=fun st ~ok ~err -> p.run st ~ok:(fun st x -> ok st (f x)) ~err } -let (>>=) (p:'a t) f : _ t = { +let bind f (p:'a t) : _ t = { run=fun st ~ok ~err -> p.run st ~ok:(fun st x -> @@ -237,7 +260,7 @@ let (>>=) (p:'a t) f : _ t = { ~err } -let (<*>) (f:_ t) (a:_ t) : _ t = { +let ap (f:_ t) (a:_ t) : _ t = { run=fun st ~ok ~err -> f.run st ~ok:(fun st f -> @@ -245,7 +268,7 @@ let (<*>) (f:_ t) (a:_ t) : _ t = { ~err } -let (<*) (a:_ t) (b:_ t) : _ t = { +let ap_left (a:_ t) (b:_ t) : _ t = { run=fun st ~ok ~err -> a.run st ~ok:(fun st x -> @@ -253,7 +276,7 @@ let (<*) (a:_ t) (b:_ t) : _ t = { ~err } -let ( *> ) (a:_ t) (b:_ t) : _ t = { +let ap_right (a:_ t) (b:_ t) : _ t = { run=fun st ~ok ~err -> a.run st ~ok:(fun st _ -> @@ -261,7 +284,47 @@ let ( *> ) (a:_ t) (b:_ t) : _ t = { ~err } -let map f x = x >|= f +let or_ (p1:'a t) (p2:'a t) : _ t = { + run=fun st ~ok ~err -> + p1.run st ~ok + ~err:(fun _e -> p2.run st ~ok ~err) +} + +let both a b = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st xa -> + b.run st ~ok:(fun st xb -> ok st (xa,xb)) ~err) + ~err +} + +let set_error_message msg (p:'a t) : _ t = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) +} + + +module Infix = struct + let[@inline] (>|=) p f = map f p + let[@inline] (>>=) p f = bind f p + let (<*>) = ap + let (<* ) = ap_left + let ( *>) = ap_right + let (<|>) = or_ + let (|||) = both + let[@inline] () p msg = set_error_message msg p + + include CCShimsMkLet_.Make(struct + type nonrec 'a t = 'a t + let (>>=) = (>>=) + let (>|=) = (>|=) + let monoid_product = both + end) +end + +include Infix + let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z @@ -390,15 +453,6 @@ let endline = let skip_space = skip_chars is_space let skip_white = skip_chars is_white -let or_ (p1:'a t) (p2:'a t) : _ t = { - run=fun st ~ok ~err -> - p1.run st ~ok - ~err:(fun _e -> p2.run st ~ok ~err) -} -let (<|>) = or_ - -let (|||) a b = map2 (fun x y ->x,y) a b - let try_or p1 ~f ~else_:p2 = { run=fun st ~ok ~err -> p1.run st @@ -412,12 +466,6 @@ let suspend f = { p.run st ~ok ~err } -let () (p:'a t) msg : _ t = { - run=fun st ~ok ~err -> - p.run st ~ok - ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) -} - (* read [len] chars at once *) let any_chars len : _ t = { run=fun st ~ok ~err -> @@ -578,6 +626,17 @@ let sep1 ~by p = sep ~by p >|= fun tl -> x :: tl +let lookahead p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun _st x -> ok st x) (* discard old state *) + ~err +} + +(*$= & ~printer:(errpp Q.Print.(string)) + (Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd") +*) + let line : _ t = { run=fun st ~ok ~err -> if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) @@ -609,7 +668,13 @@ let parse_sub_ p_sub p : _ t = { p.run (state_of_string s) ~ok:(fun _ r -> ok st1 r) ~err:(fun e -> - err {e with Error.str=st0.str; offset=e.Error.offset + st0.i})) + let pos = e.pos in + let pos = { + pos_buffer=pos.pos_buffer; + pos_offset=pos.pos_offset + st0.i; + pos_lc=None; + } in + err {e with pos})) ~err } @@ -713,17 +778,6 @@ let parse_file_exn p file = | Ok x -> x | Error e -> raise (ParseError e) -module Infix = struct - let (>|=) = (>|=) - let (>>=) = (>>=) - let (<*>) = (<*>) - let (<* ) = (<* ) - let ( *>) = ( *>) - let (<|>) = (<|>) - let (|||) = (|||) - let () = () -end - module U = struct let sep_ = sep @@ -771,6 +825,12 @@ module U = struct let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) + let bool = (string "true" *> return true) <|> (string "false" *> return false) + (*$= & ~printer:(errpp Q.Print.bool) ~cmp:(erreq (=)) + (Ok true) (parse_string U.bool "true") + (Ok false) (parse_string U.bool "false") + *) + let pair ?(start="(") ?(stop=")") ?(sep=",") p1 p2 = skip_white *> string start *> skip_white *> p1 >>= fun x1 -> @@ -791,9 +851,3 @@ module U = struct p3 >>= fun x3 -> string stop *> return (x1,x2,x3) end - -include CCShimsMkLet_.Make(struct - type nonrec 'a t = 'a t - include Infix - let monoid_product = (|||) - end) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index cba724aa..5fa8ea17 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -4,7 +4,11 @@ (** {1 Very Simple Parser Combinators} These combinators can be used to write very simple parsers, for example - to extract data from a line-oriented file. + to extract data from a line-oriented file, or as a replacement to {!Scanf}. + + {2 A few examples} + + {4 Parse a tree} {[ open CCParse;; @@ -16,7 +20,7 @@ let ptree = fix @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) ;; @@ -49,8 +53,35 @@ assert (l=l');; ]} + {2 Stability guarantees} + + Some functions are marked "experimental" and are still subject to change. + *) +type position +(** A position in the input. Typically it'll point at the {b beginning} of + an error location. *) + +(** {2 Positions in input} + + @since NEXT_RELEASE *) +module Position : sig + type t = position + + val line : t -> int + (** Line number *) + + val column : t -> int + (** Column number *) + + val line_and_column : t -> int * int + (** Line and column number *) + + val pp : Format.formatter -> t -> unit + (** Unspecified pretty-printed version of the position. *) +end + (** {2 Errors} @since NEXT_RELEASE *) module Error : sig @@ -58,6 +89,9 @@ module Error : sig (** A parse error. @since NEXT_RELEASE *) + val position : t -> position + (** Returns position of the error *) + val line_and_column : t -> int * int (** Line and column numbers of the error position. *) @@ -71,15 +105,16 @@ module Error : sig end type 'a or_error = ('a, Error.t) result -(* TODO: use [('a, error) result] instead, with easy conversion to [('a, string) result] *) +(** ['a or_error] is either [Ok x] for some result [x : 'a], + or an error {!Error.t}. + + See {!stringify_result} and {!Error.to_string} to print the + error message. *) exception ParseError of Error.t (** {2 Input} *) -type position -(* TODO: make a module Position: sig type t val line : t -> int val col : t -> int *) - (** {2 Combinators} *) type 'a t @@ -95,33 +130,20 @@ val return : 'a -> 'a t val pure : 'a -> 'a t (** Synonym to {!return}. *) -val (>|=) : 'a t -> ('a -> 'b) -> 'b t -(** Map. *) - val map : ('a -> 'b) -> 'a t -> 'b t val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t -val (>>=) : 'a t -> ('a -> 'b t) -> 'b t -(** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. *) +val bind : ('a -> 'b t) -> 'a t -> 'b t +(** [bind f p] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. + @since NEXT_RELEASE +*) -val (<*>) : ('a -> 'b) t -> 'a t -> 'b t -(** Applicative. *) - -val (<* ) : 'a t -> _ t -> 'a t -(** [a <* b] parses [a] into [x], parses [b] and ignores its result, - and returns [x]. *) - -val ( *>) : _ t -> 'a t -> 'a t -(** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - results of [a] is ignored. *) - -val (|||) : 'a t -> 'b t -> ('a * 'b) t -(** [a ||| b] parses [a], then [b], then returns the pair of their results. +val ap : ('a -> 'b) t -> 'a t -> 'b t +(** Applicative. @since NEXT_RELEASE *) val fail : string -> 'a t @@ -196,19 +218,6 @@ val is_space : char -> bool val is_white : char -> bool (** True on ' ' and '\t' and '\n'. *) -val (<|>) : 'a t -> 'a t -> 'a t -(** [a <|> b] tries to parse [a], and if [a] fails without - consuming any input, backtracks and tries - to parse [b], otherwise it fails as [a]. - See {!try_} to ensure [a] does not consume anything (but it is best - to avoid wrapping large parsers with {!try_}). *) - -val () : 'a t -> string -> 'a t -(** [a msg] behaves like [a], but if [a] fails, - [a ]. For example: - [a <|> b <|> c "expected one of a, b, c"]. *) - val suspend : (unit -> 'a t) -> 'a t (** [suspend f] is the same as [f ()], but evaluates [f ()] only when needed. *) @@ -230,12 +239,12 @@ val optional : _ t -> unit t @since NEXT_RELEASE *) val try_ : 'a t -> 'a t +[@@deprecated "plays no role anymore, just replace [try foo] with [foo]"] (** [try_ p] is just like [p] (it used to play a role in backtracking semantics but no more). @deprecated since NEXT_RELEASE it can just be removed. See {!try_opt} if you want to detect failure. *) -[@@deprecated "plays no role anymore"] val try_opt : 'a t -> 'a option t (** [try_ p] tries to parse using [p], and return [Some x] if [p] @@ -247,12 +256,17 @@ val many_until : until:_ t -> 'a t -> 'a list t the [until] parser successfully returns. If [p] fails before that then [many_until ~until p] fails as well. Typically [until] can be a closing ')' or another termination condition. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t -(** [try_or p1 ~f p2] attempts to parse [x] using [p1], +(** [try_or p1 ~f ~else_:p2] attempts to parse [x] using [p1], and then becomes [f x]. - If [p1] fails, then it becomes [p2]. + If [p1] fails, then it becomes [p2]. This can be useful if [f] is expensive + but only ever works if [p1] matches (e.g. after an opening parenthesis + or some sort of prefix). @since NEXT_RELEASE *) @@ -261,6 +275,16 @@ val or_ : 'a t -> 'a t -> 'a t from the same position. @since NEXT_RELEASE *) +val both : 'a t -> 'b t -> ('a * 'b) t +(** [both a b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + +val set_error_message : string -> 'a t -> 'a t +(** [set_error_message msg p] behaves like [p], but if [p] fails, + [set_error_message msg p] fails with [msg] instead. + @since NEXT_RELEASE +*) + val many1 : 'a t -> 'a list t (** [many1 p] is like [many p] excepts it fails if the list is empty (i.e. it needs [p] to succeed at least once). *) @@ -271,8 +295,6 @@ val skip : _ t -> unit t val sep : by:_ t -> 'a t -> 'a list t (** [sep ~by p] parses a list of [p] separated by [by]. *) -(* TODO: lookahead? *) - val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t (** Same as {!sep} but stop when [until] parses successfully. @since NEXT_RELEASE *) @@ -280,12 +302,19 @@ val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t val sep1 : by:_ t -> 'a t -> 'a list t (** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) +val lookahead : 'a t -> 'a t +(** [lookahead p] behaves like [p], except it doesn't consume any input. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + val line : string t (** Parse a line, '\n' excluded. @since NEXT_RELEASE *) val each_line : 'a t -> 'a list t (** [each_line p] runs [p] on each line of the input. + {b EXPERIMENTAL} @since NEXT_RELEASE *) val fix : ('a t -> 'a t) -> 'a t @@ -299,15 +328,70 @@ val memo : 'a t -> 'a t This can be costly in memory, but improve the run time a lot if there is a lot of backtracking involving [p]. + Do not call {!memo} inside other functions, especially with {!(>>=)}, + {!map}, etc. being so prevalent. Instead the correct way to use it + is in a toplevel definition: + + {[ + let my_expensive_parser = memo (foo *> bar >>= fun i -> …) + ]} + This function is not thread-safe. *) val fix_memo : ('a t -> 'a t) -> 'a t (** Like {!fix}, but the fixpoint is memoized. *) -(** {2 Parse} +(** {2 Infix} *) - Those functions have a label [~p] on the parser, since 0.14. -*) +module Infix : sig + val (>|=) : 'a t -> ('a -> 'b) -> 'b t + (** Alias to {!map}. [p >|= f] parses an item [x] using [p], + and returns [f x]. *) + + val (>>=) : 'a t -> ('a -> 'b t) -> 'b t + (** Alias to {!bind}. + [p >>= f] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. *) + + val (<*>) : ('a -> 'b) t -> 'a t -> 'b t + (** Applicative. *) + + val (<* ) : 'a t -> _ t -> 'a t + (** [a <* b] parses [a] into [x], parses [b] and ignores its result, + and returns [x]. *) + + val ( *>) : _ t -> 'a t -> 'a t + (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The + result of [a] is ignored. *) + + val (<|>) : 'a t -> 'a t -> 'a t + (** Alias to {!or_}. + + [a <|> b] tries to parse [a], and if [a] fails without + consuming any input, backtracks and tries + to parse [b], otherwise it fails as [a]. + See {!try_} to ensure [a] does not consume anything (but it is best + to avoid wrapping large parsers with {!try_}). *) + + val () : 'a t -> string -> 'a t + (** [a msg] behaves like [a], but if [a] fails, + [a ]. For example: + [a <|> b <|> c "expected one of a, b, c"]. *) + + val (|||) : 'a t -> 'b t -> ('a * 'b) t + (** Alias to {!both}. + [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + + (** Let operators on OCaml >= 4.08.0, nothing otherwise + @since 2.8 *) + include CCShimsMkLet_.S with type 'a t_let := 'a t +end + +include module type of Infix + +(** {2 Parse input} *) val stringify_result : 'a or_error -> ('a, string) result (** Turn a {!Error.t}-oriented result into a more basic string result. @@ -333,44 +417,6 @@ val parse_file_exn : 'a t -> string -> 'a (** Same as {!parse_file}, but @raise ParseError if it fails. *) -(** {2 Infix} *) - -module Infix : sig - val (>|=) : 'a t -> ('a -> 'b) -> 'b t - (** Map. [p >|= f] parses an item [x] using [p], - and returns [f x]. *) - - val (>>=) : 'a t -> ('a -> 'b t) -> 'b t - (** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. *) - - val (<*>) : ('a -> 'b) t -> 'a t -> 'b t - (** Applicative. *) - - val (<* ) : 'a t -> _ t -> 'a t - (** [a <* b] parses [a] into [x], parses [b] and ignores its result, - and returns [x]. *) - - val ( *>) : _ t -> 'a t -> 'a t - (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - result of [a] is ignored. *) - - val (<|>) : 'a t -> 'a t -> 'a t - (** [a <|> b] tries to parse [a], and if [a] fails, it backtracks and tries - to parse [b]. - Alias to {!or_} *) - - val (|||) : 'a t -> 'b t -> ('a * 'b) t - (** [a ||| b] parses [a], then [b], then returns the pair of their results. - @since NEXT_RELEASE *) - - val () : 'a t -> string -> 'a t - (** [a msg] behaves like [a], but if [a] fails, - it fails with [msg] instead. Useful as the last choice in a series of - [<|>]: [a <|> b <|> c "expected a|b|c"]. *) - -end (** {2 Utils} @@ -396,7 +442,9 @@ module U : sig val word : string t (** Non empty string of alpha num, start with alpha. *) - (* TODO: boolean literal *) + val bool : bool t + (** Accepts "true" or "false" *) + (* TODO: quoted string *) val pair : ?start:string -> ?stop:string -> ?sep:string -> @@ -409,7 +457,3 @@ module U : sig (** Parse a triple using OCaml syntactic conventions. The default is "(a, b, c)". *) end - -(** Let operators on OCaml >= 4.08.0, nothing otherwise - @since 2.8 *) -include CCShimsMkLet_.S with type 'a t_let := 'a t