ocaml-containers/src/core/CCParse.mli


(* This file is free software. See file "license" for more details. *)

(** Very Simple Parser Combinators

    These combinators can be used to write very simple parsers, for example
    to extract data from a line-oriented file, or as a replacement to {!Scanf}.

    {2 A few examples}

    Some more advanced example(s) can be found in the [/examples] directory.

    {4 Parse a tree}

    {[
      open CCParse;;

      type tree = L of int | N of tree * tree;;

      let mk_leaf x = L x
      let mk_node x y = N(x,y)

      let ptree = fix @@ fun self ->
        skip_space *>
          ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')')
            <|>
              (U.int >|= mk_leaf) )
      ;;

      parse_string_exn ptree "(1 (2 3))" ;;
      parse_string_exn ptree "((1 2) (3 (4 5)))" ;;

    ]}

    {4 Parse a list of words}

    {[
      open Containers.Parse;;
      let p = U.list ~sep:"," U.word;;
      parse_string_exn p "[abc , de, hello ,world  ]";;
    ]}

    {4 Stress Test}
    This makes a list of 100_000 integers, prints it and parses it back.

    {[
      let p = CCParse.(U.list ~sep:"," U.int);;

      let l = CCList.(1 -- 100_000);;
      let l_printed =
        CCFormat.(to_string (within "[" "]" (list ~sep:(return ",@,") int))) l;;

      let l' = CCParse.parse_string_exn p l_printed;;

      assert (l=l');;
    ]}

    {2 Stability guarantees}

    Some functions are marked "experimental" and are still subject to change.

*)

type position
(** A position in the input. Typically it'll point at the {b beginning} of
    an error location. *)

(** {2 Positions in input}

    @since 3.6 *)
module Position : sig
  type t = position

  val line : t -> int
  (** Line number, 0 based *)

  val column : t -> int
  (** Column number, 0 based *)

  val line_and_column : t -> int * int
  (** Line and column number *)

  val pp : Format.formatter -> t -> unit
  (** Unspecified pretty-printed version of the position. *)
end

(** {2 Errors}
    @since 3.6 *)
module Error : sig
  type t
  (** A parse error.
      @since 3.6 *)

  val position : t -> position
  (** Returns the position of the error. *)

  val line_and_column : t -> int * int
  (** Line and column numbers of the error position. *)

  val msg : t -> string
  (** [msg e] is the message carried by the error [e], as a plain string.
      See {!to_string} and {!pp} to render the whole error. *)

  val to_string : t -> string
  (** Renders the error as a string. *)

  val pp : Format.formatter -> t -> unit
  (** Pretty prints the error on the given formatter. *)
end

type +'a or_error = ('a, Error.t) result
(** ['a or_error] is either [Ok x] for some result [x : 'a],
    or an error {!Error.t}.

    See {!stringify_result} and {!Error.to_string} to print the
    error message. *)

exception ParseError of Error.t
(** Exception carrying a parse error. It is raised by the [_exn] entry
    points, such as {!parse_string_exn} and {!parse_file_exn}, when
    parsing fails. *)

(** {2 Input} *)

(** {2 Combinators} *)

type 'a t
(** The abstract type of parsers that return a value of type ['a] (or fail).

    @raise ParseError in case of failure.
    @since 3.6 the type is private.
*)

val return : 'a -> 'a t
(** Always succeeds, without consuming its input. *)

val pure : 'a -> 'a t
(** Synonym to {!return}. *)

val map : ('a -> 'b) -> 'a t -> 'b t
(** [map f p] parses an item [x] using [p], and returns [f x].
    See also the infix alias {!(>|=)}. *)

val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t
(** [map2 f p1 p2] combines the result [x1] of [p1] and the result [x2]
    of [p2] into [f x1 x2].
    NOTE(review): presumably [p1] is run before [p2], as in {!both};
    confirm against the implementation. *)

val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t
(** [map3 f p1 p2 p3] combines the results [x1], [x2], [x3] of the three
    parsers into [f x1 x2 x3].
    NOTE(review): presumably the parsers are run in the order
    [p1], [p2], [p3]; confirm against the implementation. *)

val bind : ('a -> 'b t) -> 'a t -> 'b t
(** [bind f p] results in a new parser which behaves as [p] then,
    in case of success, applies [f] to the result.
    @since 3.6
*)

val ap : ('a -> 'b) t -> 'a t -> 'b t
(** Applicative.
    @since 3.6 *)

val eoi : unit t
(** Expect the end of input, fails otherwise. *)

val empty : unit t
(** Succeed with [()].
    @since 3.6 *)

val fail : string -> 'a t
(** [fail msg] fails with the given message. It can trigger a backtrack. *)

val failf: ('a, unit, string, 'b t) format4 -> 'a
(** [Printf.sprintf] version of {!fail}: [failf fmt args…] fails with the
    message obtained by formatting [args…] according to [fmt].
    The format string has the type expected by [Printf.ksprintf]
    (not the one used by [Format]). *)

val fail_lazy : (unit -> string) -> 'a t
(** Like {!fail}, but only produce an error message on demand.
    @since 3.6 *)

val parsing : string -> 'a t -> 'a t
(** [parsing s p] behaves the same as [p], with the information that
    we are parsing [s], if [p] fails.
    The message [s] is added to the error, it does not replace it,
    nor does the location change (the error still points to
    the same location as in [p]). *)

val set_error_message : string -> 'a t -> 'a t
(** [set_error_message msg p] behaves like [p], but if [p] fails,
    [set_error_message msg p] fails with [msg] instead and at the current
    position. The internal error message of [p] is just discarded.
    @since 3.6 *)

val pos : position t
(** [pos] returns the current position in the buffer.

    {b EXPERIMENTAL}
    @since NEXT_RELEASE *)

val with_pos : 'a t -> ('a * position) t
(** [with_pos p] behaves like [p], but returns the (starting) position
    along with [p]'s result.

    {b EXPERIMENTAL}
    @since 3.6 *)

val any_char : char t
(** [any_char] parses any character.
    It still fails if the end of input was reached.
    @since 3.6 *)

val any_char_n : int -> string t
(** [any_char_n len] parses exactly [len] characters from the input.
    Fails if the input doesn't contain at least [len] chars.
    @since 3.6 *)

val char : char -> char t
(** [char c] parses the character [c] and nothing else. *)

type slice
(** A slice of the input, as returned by some combinators such
    as {!split_1} or {!split_list} or {!take}.

    The idea is that one can use some parsers to cut the input into slices,
    e.g. split into lines, or split a line into fields (think CSV or TSV).
    Then a variety of parsers can be used on each slice to extract data from
    it using {!recurse}.

    Slices contain enough information to make it possible
    for [recurse slice p] to report failures (if [p] fails) using locations
    from the original input, not relative to the slice.
    Therefore, even after splitting the input into lines using, say, {!each_line},
    a failure to parse the 500th line will be reported at line 500 and
    not at line 1.

    {b EXPERIMENTAL}
    @since 3.6 *)

(** Functions on slices.
    @since 3.6 *)
module Slice : sig
  type t = slice

  val is_empty : t -> bool
  (** Is the slice empty? *)

  val length : t -> int
  (** Length of the slice *)

  val to_string : t -> string
  (** Convert the slice into a string.
      Linear time and memory in [length slice] *)
end

val recurse : slice -> 'a t -> 'a t
(** [recurse slice p] parses the [slice]
    (most likely obtained via another combinator, such as {!split_1}
    or {!split_list}), using [p].

    The slice contains a position which is used to relocate error
    messages to their position in the whole input, not just relative to
    the slice.

    {b EXPERIMENTAL}
    @since 3.6 *)

val set_current_slice : slice -> unit t
(** [set_current_slice slice] replaces the parser's state with [slice].

    {b EXPERIMENTAL}
    @since 3.6 *)

val chars_fold :
  f:('acc -> char ->
     [`Continue of 'acc | `Consume_and_stop of 'acc | `Stop of 'acc | `Fail of string]) ->
  'acc ->
  ('acc * slice) t
(** [chars_fold ~f acc0] folds over characters of the input.
    Each char [c] is passed, along with the current accumulator, to [f];
    [f] can either:

    - stop, by returning [`Stop acc]. In this case the final accumulator [acc]
      is returned, and [c] is not consumed.
    - consume char and stop, by returning [`Consume_and_stop acc].
    - fail, by returning [`Fail msg]. In this case the parser fails
      with the given message.
    - continue, by returning [`Continue acc]. The parser continues to the
      next char with the new accumulator.

    This is a generalization of {!chars_if} that allows one to transform
    characters on the fly, skip some, handle escape sequences, etc.
    It can also be useful as a base component for a lexer.

    @return a pair of the final accumulator, and the slice matched by the fold.
    @since 3.6 *)

val chars_fold_transduce :
  f:('acc -> char ->
     [ `Continue of 'acc | `Yield of 'acc * char
     | `Consume_and_stop | `Stop | `Fail of string]) ->
  'acc ->
  ('acc * string) t
(** Same as {!chars_fold} but with the following differences:

    - returns a string along with the accumulator, rather than the slice
      of all the characters accepted by [`Continue _].
      The string is built from characters returned by [`Yield].
    - new case [`Yield (acc, c)] adds [c] to the returned string
      and continues parsing with [acc].

    @since 3.6 *)

val take : int -> slice t
(** [take len] parses exactly [len] characters from the input.
    Fails if the input doesn't contain at least [len] chars.
    @since 3.6 *)

val take_if : (char -> bool) -> slice t
(** [take_if f] takes characters as long as they satisfy the predicate [f].
    @since 3.6 *)

val take1_if : ?descr:string -> (char -> bool) -> slice t
(** [take1_if f] takes characters as long as they satisfy the predicate [f].
    Fails if no character satisfies [f].
    @param descr describes what kind of character was expected, in case of error
    @since 3.6 *)

val char_if : ?descr:string -> (char -> bool) -> char t
(** [char_if ?descr f] parses a character [c] if [f c = true].
    Fails if the next char does not satisfy [f].
    @param descr describes what kind of character was expected, in case of error *)

val chars_if : (char -> bool) -> string t
(** [chars_if f] parses a string of chars that satisfy [f].
    Cannot fail. *)

val chars1_if : ?descr:string -> (char -> bool) -> string t
(** Like {!chars_if}, but accepts only non-empty strings.
    [chars1_if p] fails if the string accepted by [chars_if p] is empty.
    [chars1_if p] is equivalent to [take1_if p >|= Slice.to_string].
    @param descr describes what kind of character was expected, in case of error *)

val endline : char t
(** Parse '\n'. *)

val space : char t
(** Tab or space. *)

val white : char t
(** Tab or space or newline. *)

val skip_chars : (char -> bool) -> unit t
(** Skip 0 or more chars satisfying the predicate. *)

val skip_space : unit t
(** Skip ' ' and '\t'. *)

val skip_white : unit t
(** Skip ' ' and '\t' and '\n'. *)

val is_alpha : char -> bool
(** Is the char a letter? *)

val is_num : char -> bool
(** Is the char a digit? *)

val is_alpha_num : char -> bool
(** Is the char a letter or a digit? *)

val is_space : char -> bool
(** True on ' ' and '\t'. *)

val is_white : char -> bool
(** True on ' ' and '\t' and '\n'. *)

val suspend : (unit -> 'a t) -> 'a t
(** [suspend f] is the same as [f ()], but evaluates [f ()] only
    when needed.

    A practical use case is to implement recursive parsers manually,
    as described in {!fix}. The parser is [let rec p () = …],
    and [suspend p] can be used in the definition to use [p].
*)

val string : string -> string t
(** [string s] parses exactly the string [s], and nothing else. *)

val exact : string -> string t
(** Alias to {!string}.
    @since 3.6 *)

val many : 'a t -> 'a list t
(** [many p] parses [p] repeatedly, until [p] fails, and
    collects the results into a list. *)

val optional : _ t -> unit t
(** [optional p] tries to parse [p], and returns [()] whether it
    succeeded or failed. Cannot fail itself.
    It consumes input if [p] succeeded (as much as [p] consumed), but
    consumes no input if [p] failed.
    @since 3.6 *)

val try_ : 'a t -> 'a t
[@@deprecated "plays no role anymore, just replace [try foo] with [foo]"]
(** [try_ p] is just like [p] (it used to play a role in backtracking
    semantics but no more).

    @deprecated since 3.6 it can just be removed. See {!try_opt} if you want
    to detect failure. *)

val try_opt : 'a t -> 'a option t
(** [try_opt p] tries to parse using [p], and return [Some x] if [p]
    succeeded with [x] (and consumes what [p] consumed).
    Otherwise it returns [None] and consumes nothing. This cannot fail.
    @since 3.6 *)

val many_until : until:_ t -> 'a t -> 'a list t
(** [many_until ~until p] parses as many [p] as it can until
    the [until] parser successfully returns.
    If [p] fails before that then [many_until ~until p] fails as well.
    Typically [until] can be a closing ')' or another termination condition,
    and what is consumed by [until] is also consumed by [many_until ~until p].

    {b EXPERIMENTAL}

    @since 3.6 *)

val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t
(** [try_or p1 ~f ~else_:p2] attempts to parse [x] using [p1],
    and then becomes [f x].
    If [p1] fails, then it becomes [p2]. This can be useful if [f] is expensive
    but only ever works if [p1] matches (e.g. after an opening parenthesis
    or some sort of prefix).
    @since 3.6
*)

val try_or_l :
  ?msg:string ->
  ?else_:'a t ->
  (unit t * 'a t) list ->
  'a t
(** [try_or_l ?else_ l] tries each pair [(test, p)] in order.
    If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p],
    whether [p] fails or not. If [test] consumes input, the state is restored
    before calling [p].
    If they all fail, and [else_] is defined, then it behaves like [else_].
    If all fail, and [else_] is [None], then it fails as well.

    This is a performance optimization compared to {!(<|>)}. We commit to a
    branch if the test succeeds, without backtracking at all.
    It can also provide better error messages, because failures in the parser
    will not be reported as failures in [try_or_l].

    See {!lookahead_ignore} for a convenient way of writing the test conditions.

    @param msg error message if all options fail

    {b EXPERIMENTAL}
    @since 3.6 *)

val or_ : 'a t -> 'a t -> 'a t
(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2]
    from the same position.
    @since 3.6 *)

val both : 'a t -> 'b t -> ('a * 'b) t
(** [both a b] parses [a], then [b], then returns the pair of their results.
    @since 3.6 *)

val many1 : 'a t -> 'a list t
(** [many1 p] is like [many p] except it fails if the
    list is empty (i.e. it needs [p] to succeed at least once). *)

val skip : _ t -> unit t
(** [skip p] parses zero or more times [p] and ignores its result.
    It is eager, meaning it will continue as long as [p] succeeds.
    As soon as [p] fails, [skip p] stops consuming any input. *)

val sep : by:_ t -> 'a t -> 'a list t
(** [sep ~by p] parses a list of [p] separated by [by]. *)

val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t
(** Same as {!sep} but stop when [until] parses successfully.
    @since 3.6 *)

val sep1 : by:_ t -> 'a t -> 'a list t
(** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *)

val lookahead : 'a t -> 'a t
(** [lookahead p] behaves like [p], except it doesn't consume any input.

    {b EXPERIMENTAL}
    @since 3.6 *)

val lookahead_ignore : 'a t -> unit t
(** [lookahead_ignore p] tries to parse input with [p],
    and succeeds if [p] succeeds. However it doesn't consume any input
    and returns [()], so in effect its only use-case is to detect
    whether [p] succeeds, e.g. in {!try_or_l}.

    {b EXPERIMENTAL}
    @since 3.6 *)

val fix : ('a t -> 'a t) -> 'a t
(** Fixpoint combinator. [fix (fun self -> p)] is the parser [p],
    in which [self] refers to the parser [p] itself (which is useful to
    parse recursive structures).

    An alternative, manual implementation to [let p = fix (fun self -> q)]
    is:
    {[ let rec p () =
        let self = suspend p in
        q
    ]}
*)

val line : slice t
(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n'].
    @since 3.6 *)

val line_str : string t
(** [line_str] is [line >|= Slice.to_string].
    It parses the next line and turns the slice into a string.
    The state points to the character immediately after the ['\n'] character.
    @since 3.6 *)

val each_line : 'a t -> 'a list t
(** [each_line p] runs [p] on each line of the input.
    {b EXPERIMENTAL}
    @since 3.6 *)

val split_1 : on_char:char -> (slice * slice option) t
(** [split_1 ~on_char] looks for [on_char] in the input, and returns a
    pair [sl1, sl2], where:

    - [sl1] is the slice of the input that precedes the first occurrence
      of [on_char], or the whole input if [on_char] cannot be found.
      It does not contain [on_char].
    - [sl2] is the slice that comes after [on_char],
      or [None] if [on_char] couldn't be found. It doesn't contain the first
      occurrence of [on_char] (if any).

    The parser is now positioned at the end of the input.

    {b EXPERIMENTAL}
    @since 3.6 *)

val split_list : on_char:char -> slice list t
(** [split_list ~on_char] splits the input on all occurrences of [on_char],
    returning a list of slices.

    {b EXPERIMENTAL}
    @since 3.6 *)

val split_list_at_most : on_char:char -> int -> slice list t
(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most
    [n] times, to get a list of at most [n+1] elements.
    The last element might contain [on_char]. This is useful to limit the
    amount of work done by {!split_list}.

    {b EXPERIMENTAL}
    @since 3.6 *)


val split_2 : on_char:char -> (slice * slice) t
(** [split_2 ~on_char] splits the input into exactly 2 fields,
    and fails if the split yields fewer or more than 2 items.
    {b EXPERIMENTAL}
    @since 3.6 *)

val split_3 : on_char:char -> (slice * slice * slice) t
(** See {!split_2}
    {b EXPERIMENTAL}
    @since 3.6 *)

val split_4 : on_char:char -> (slice * slice * slice * slice) t
(** See {!split_2}
    {b EXPERIMENTAL}
    @since 3.6 *)

val each_split : on_char:char -> 'a t -> 'a list t
(** [each_split ~on_char p] uses [split_list ~on_char] to split
    the input, then parses each chunk of the input thus obtained using [p].

    The difference with [sep ~by:(char on_char) p] is that
    [sep] calls [p] first, and only tries to find [on_char] after [p] returns.
    While it is more flexible, this technique also means [p] has to be careful
    not to consume [on_char] by error.

    A useful specialization of this is {!each_line}, which is
    basically [each_split ~on_char:'\n' p].

    {b EXPERIMENTAL}
    @since 3.6 *)

val all : slice t
(** [all] returns all the unconsumed input as a slice, and consumes it.
    Use {!Slice.to_string} to turn it into a string.

    Note that [lookahead all] can be used to {i peek} at the rest of the input
    without consuming anything.

    @since 3.6 *)

val all_str : string t
(** [all_str] accepts all the remaining chars and extracts them into a
    string. Similar to {!all} but with a string.

    {b EXPERIMENTAL}
    @since 3.6 *)

(* TODO
val trim : slice t
(** [trim] is like {!all}, but removes whitespace on the left and right.
   {b EXPERIMENTAL}
    @since 3.6 *)
 *)

val memo : 'a t -> 'a t
(** Memoize the parser. [memo p] will behave like [p], but when called
    in a state (read: position in input) it has already processed, [memo p]
    returns a result directly. The implementation uses an underlying
    hashtable.
    This can be costly in memory, but improve the run time a lot if there
    is a lot of backtracking involving [p].

    Do not call {!memo} inside other functions, especially with {!(>>=)},
    {!map}, etc. being so prevalent. Instead the correct way to use it
    is in a toplevel definition:

    {[
      let my_expensive_parser = memo (foo *> bar >>= fun i -> …)
    ]}

    This function is not thread-safe. *)

val fix_memo : ('a t -> 'a t) -> 'a t
(** Like {!fix}, but the fixpoint is memoized. *)

(** {2 Infix} *)

module Infix : sig
  val (>|=) : 'a t -> ('a -> 'b) -> 'b t
  (** Alias to {!map}. [p >|= f] parses an item [x] using [p],
      and returns [f x]. *)

  val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
  (** Alias to {!bind}.
      [p >>= f] results in a new parser which behaves as [p] then,
      in case of success, applies [f] to the result. *)

  val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
  (** Applicative. *)

  val (<* ) : 'a t -> _ t -> 'a t
  (** [a <* b] parses [a] into [x], parses [b] and ignores its result,
      and returns [x]. *)

  val ( *>) : _ t -> 'a t -> 'a t
  (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The
      result of [a] is ignored. *)

  val (<|>) : 'a t -> 'a t -> 'a t
  (** Alias to {!or_}.

      [a <|> b] tries to parse [a], and if [a] fails, tries to parse [b]
      from the same position, like {!or_} does. If [b] fails as well,
      so does [a <|> b]. *)

  val (<?>) : 'a t -> string -> 'a t
  (** [a <?> msg] behaves like [a], but if [a] fails,
      [a <?> msg] fails with [msg] instead.
      Useful as the last choice in a series of [<|>]. For example:
      [a <|> b <|> c <?> "expected one of a, b, c"]. *)

  val (|||) : 'a t -> 'b t -> ('a * 'b) t
  (** Alias to {!both}.
      [a ||| b] parses [a], then [b], then returns the pair of their results.
      @since 3.6 *)

  (** Let operators on OCaml >= 4.08.0, nothing otherwise
      @since 2.8
      @inline *)
  include CCShimsMkLet_.S with type 'a t_let := 'a t
end

include module type of Infix

(** {2 Parse input} *)

val stringify_result : 'a or_error -> ('a, string) result
(** Turn a {!Error.t}-oriented result into a more basic string result.
    @since 3.6 *)

val parse_string : 'a t -> string -> ('a, string) result
(** Parse a string using the parser. *)

val parse_string_e : 'a t -> string -> 'a or_error
(** Version of {!parse_string} that returns a more detailed error. *)

val parse_string_exn : 'a t -> string -> 'a
(** Same as {!parse_string}, but
    @raise ParseError if it fails. *)

val parse_file : 'a t -> string -> ('a, string) result
(** [parse_file p filename] parses file named [filename] with [p]
    by opening the file and reading it whole. *)

val parse_file_e : 'a t -> string -> 'a or_error
(** Version of {!parse_file} that returns a more detailed error. *)

val parse_file_exn : 'a t -> string -> 'a
(** Same as {!parse_file}, but
    @raise ParseError if it fails. *)


(** {2 Utils}

    This is useful to parse OCaml-like values in a simple way.
    All the parsers are whitespace-insensitive (they skip whitespace). *)
module U : sig
  val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t
  (** [list p] parses a list of [p], with the OCaml conventions for
      start token "\[", stop token "\]" and separator ";".
      Whitespace between items is skipped. *)

  (* TODO: parse option? *)
  (* TODO: split on whitespace? *)

  val int : int t
  (** Parse an int in decimal representation. *)

  val in_paren : 'a t -> 'a t
  (** [in_paren p] parses an opening "(", then [p], and then ")".
      @since 3.6 *)

  val in_parens_opt : 'a t -> 'a t
  (** [in_parens_opt p] parses [p] in an arbitrary number of nested
      parentheses (possibly 0).
      @since 3.6 *)

  val option : 'a t -> 'a option t
  (** [option p] parses "Some <x>" into [Some x] if [p] parses "<x>" into [x],
      and parses "None" into [None].
      @since 3.6 *)

  val hexa_int : int t
  (** Parse an int in hexadecimal format. Accepts an optional [0x] prefix,
      and ignores capitalization.
      @since 3.6 *)

  val word : string t
  (** Non empty string of alpha num, start with alpha. *)

  val bool : bool t
  (** Accepts "true" or "false"
      @since 3.6 *)

  (* TODO: quoted string *)

  val pair : ?start:string -> ?stop:string -> ?sep:string ->
    'a t -> 'b t -> ('a * 'b) t
  (** Parse a pair using OCaml syntactic conventions.
      The default is "(a, b)". *)

  val triple : ?start:string -> ?stop:string -> ?sep:string ->
    'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t
  (** Parse a triple using OCaml syntactic conventions.
      The default is "(a, b, c)". *)
end

(** Debugging utils.
    {b EXPERIMENTAL}
    @since 3.6 *)
module Debug_ : sig
  val trace_fail : string -> 'a t -> 'a t
  (** [trace_fail name p] behaves like [p], but prints the error message of [p]
      on stderr whenever [p] fails.
      @param name used as a prefix of all trace messages. *)

  val trace_success : string -> print:('a -> string) -> 'a t -> 'a t
  (** [trace_success name ~print p] behaves like [p], but
      prints successful runs of [p] using [print]. *)

  val trace_success_or_fail : string -> print:('a -> string) -> 'a t -> 'a t
  (** [trace_success_or_fail name ~print p] behaves like [p], but traces
      both failures and successes of [p]. See {!trace_fail} and
      {!trace_success}. *)
end