mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
423 lines
11 KiB
OCaml
423 lines
11 KiB
OCaml
|
|
(* This file is free software. See file "license" for more details. *)
|
|
|
|
(** {1 Very Simple Parser Combinators}
|
|
|
|
{[
|
|
open CCParse;;
|
|
|
|
type tree = L of int | N of tree * tree;;
|
|
|
|
let mk_leaf x = L x
|
|
let mk_node x y = N(x,y)
|
|
|
|
let ptree = fix @@ fun self ->
|
|
skip_space *>
|
|
( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')')
|
|
<|>
|
|
(U.int >|= mk_leaf) )
|
|
;;
|
|
|
|
parse_string_exn ptree "(1 (2 3))" ;;
|
|
parse_string_exn ptree "((1 2) (3 (4 5)))" ;;
|
|
|
|
]}
|
|
|
|
{6 Parse a list of words}
|
|
|
|
{[
|
|
open Containers.Parse;;
|
|
let p = U.list ~sep:"," U.word;;
|
|
parse_string_exn p "[abc , de, hello ,world ]";;
|
|
]}
|
|
|
|
{6 Stress Test}
|
|
This makes a list of 100_000 integers, prints it and parses it back.
|
|
|
|
{[
|
|
let p = CCParse.(U.list ~sep:"," U.int);;
|
|
|
|
let l = CCList.(1 -- 100_000);;
|
|
let l_printed =
|
|
CCFormat.(to_string (within "[" "]" (list ~sep:(return ",@,") int))) l;;
|
|
|
|
let l' = CCParse.parse_string_exn p l_printed;;
|
|
|
|
assert (l=l');;
|
|
]}
|
|
|
|
*)
|
|
|
|
(*$inject
|
|
module T = struct
|
|
type tree = L of int | N of tree * tree
|
|
end
|
|
open T
|
|
open Result
|
|
|
|
let mk_leaf x = L x
|
|
let mk_node x y = N(x,y)
|
|
|
|
let ptree = fix @@ fun self ->
|
|
skip_space *>
|
|
( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')')
|
|
<|>
|
|
(U.int >|= mk_leaf) )
|
|
|
|
let ptree' = fix_memo @@ fun self ->
|
|
skip_space *>
|
|
( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')')
|
|
<|>
|
|
(U.int >|= mk_leaf) )
|
|
|
|
let rec pptree = function
|
|
| N (a,b) -> Printf.sprintf "N (%s, %s)" (pptree a) (pptree b)
|
|
| L x -> Printf.sprintf "L %d" x
|
|
|
|
let errpptree = function
|
|
| Ok x -> "Ok " ^ pptree x
|
|
| Error s -> "Error " ^ s
|
|
*)
|
|
|
|
(*$= & ~printer:errpptree
|
|
(Ok (N (L 1, N (L 2, L 3)))) \
|
|
(parse_string ptree "(1 (2 3))" )
|
|
(Ok (N (N (L 1, L 2), N (L 3, N (L 4, L 5))))) \
|
|
(parse_string ptree "((1 2) (3 (4 5)))" )
|
|
(Ok (N (L 1, N (L 2, L 3)))) \
|
|
(parse_string ptree' "(1 (2 3))" )
|
|
(Ok (N (N (L 1, L 2), N (L 3, N (L 4, L 5))))) \
|
|
(parse_string ptree' "((1 2) (3 (4 5)))" )
|
|
*)
|
|
|
|
(*$R
|
|
let p = U.list ~sep:"," U.word in
|
|
let printer = function
|
|
| Ok l -> "Ok " ^ CCFormat.(to_string (list string)) l
|
|
| Error s -> "Error " ^ s
|
|
in
|
|
assert_equal ~printer
|
|
(Ok ["abc"; "de"; "hello"; "world"])
|
|
(parse_string p "[abc , de, hello ,world ]");
|
|
*)
|
|
|
|
(*$R
|
|
let test n =
|
|
let p = CCParse.(U.list ~sep:"," U.int) in
|
|
|
|
let l = CCList.(1 -- n) in
|
|
let l_printed =
|
|
CCFormat.(to_string (within "[" "]" (list ~sep:(return ",") int))) l in
|
|
|
|
let l' = CCParse.parse_string_exn p l_printed in
|
|
|
|
assert_equal ~printer:Q.Print.(list int) l l'
|
|
in
|
|
test 100_000;
|
|
test 400_000;
|
|
|
|
*)
|
|
|
|
(*$R
|
|
let open CCParse.Infix in
|
|
let module P = CCParse in
|
|
|
|
let parens p = P.try_ (P.char '(') *> p <* P.char ')' in
|
|
let add = P.char '+' *> P.return (+) in
|
|
let sub = P.char '-' *> P.return (-) in
|
|
let mul = P.char '*' *> P.return ( * ) in
|
|
let div = P.char '/' *> P.return ( / ) in
|
|
let integer =
|
|
P.chars1_if (function '0'..'9'->true|_->false) >|= int_of_string in
|
|
|
|
let chainl1 e op =
|
|
P.fix (fun r ->
|
|
e >>= fun x -> P.try_ (op <*> P.return x <*> r) <|> P.return x) in
|
|
|
|
let expr : int P.t =
|
|
P.fix (fun expr ->
|
|
let factor = parens expr <|> integer in
|
|
let term = chainl1 factor (mul <|> div) in
|
|
chainl1 term (add <|> sub)) in
|
|
|
|
assert_equal (Ok 6) (P.parse_string expr "4*1+2");
|
|
assert_equal (Ok 12) (P.parse_string expr "4*(1+2)");
|
|
()
|
|
*)
|
|
|
|
|
|
type 'a or_error = ('a, string) Result.result
|
|
|
|
type line_num = int
|
|
type col_num = int
|
|
|
|
type parse_branch
|
|
|
|
val string_of_branch : parse_branch -> string
|
|
|
|
exception ParseError of parse_branch * (unit -> string)
|
|
(** parsing branch * message. *)
|
|
|
|
(** {2 Input} *)
|
|
|
|
type position
|
|
|
|
type state
|
|
|
|
val state_of_string : string -> state
|
|
|
|
(** {2 Combinators} *)
|
|
|
|
type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit
|
|
(** Takes the input and two continuations:
|
|
{ul
|
|
{- [ok] to call with the result when it's done}
|
|
{- [err] to call when the parser met an error}
|
|
}
|
|
@raise ParseError in case of failure. *)
|
|
|
|
val return : 'a -> 'a t
|
|
(** Always succeeds, without consuming its input. *)
|
|
|
|
val pure : 'a -> 'a t
|
|
(** Synonym to {!return}. *)
|
|
|
|
val (>|=) : 'a t -> ('a -> 'b) -> 'b t
|
|
(** Map. *)
|
|
|
|
val map : ('a -> 'b) -> 'a t -> 'b t
|
|
|
|
val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t
|
|
|
|
val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t
|
|
|
|
val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
|
|
(** Monadic bind.
|
|
[p >>= f] results in a new parser which behaves as [p] then,
|
|
in case of success, applies [f] to the result. *)
|
|
|
|
val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
|
|
(** Applicative. *)
|
|
|
|
val (<* ) : 'a t -> _ t -> 'a t
|
|
(** [a <* b] parses [a] into [x], parses [b] and ignores its result,
|
|
and returns [x]. *)
|
|
|
|
val ( *>) : _ t -> 'a t -> 'a t
|
|
(** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The
|
|
results of [a] is ignored. *)
|
|
|
|
val fail : string -> 'a t
|
|
(** [fail msg] fails with the given message. It can trigger a backtrack. *)
|
|
|
|
val failf: ('a, unit, string, 'b t) format4 -> 'a
|
|
(** [Format.sprintf] version of {!fail}. *)
|
|
|
|
val parsing : string -> 'a t -> 'a t
|
|
(** [parsing s p] behaves the same as [p], with the information that
|
|
we are parsing [s], if [p] fails. *)
|
|
|
|
val eoi : unit t
|
|
(** Expect the end of input, fails otherwise. *)
|
|
|
|
val nop : unit t
|
|
(** Succeed with [()]. *)
|
|
|
|
val char : char -> char t
|
|
(** [char c] parses the character [c] and nothing else. *)
|
|
|
|
val char_if : (char -> bool) -> char t
|
|
(** [char_if f] parses a character [c] if [f c = true]. *)
|
|
|
|
val chars_if : (char -> bool) -> string t
|
|
(** [chars_if f] parses a string of chars that satisfy [f]. *)
|
|
|
|
val chars1_if : (char -> bool) -> string t
|
|
(** Like {!chars_if}, but only non-empty strings. *)
|
|
|
|
val endline : char t
|
|
(** Parses '\n'. *)
|
|
|
|
val space : char t
|
|
(** Tab or space. *)
|
|
|
|
val white : char t
|
|
(** Tab or space or newline. *)
|
|
|
|
val skip_chars : (char -> bool) -> unit t
|
|
(** Skip 0 or more chars satisfying the predicate. *)
|
|
|
|
val skip_space : unit t
|
|
(** Skip ' ' and '\t'. *)
|
|
|
|
val skip_white : unit t
|
|
(** Skip ' ' and '\t' and '\n'. *)
|
|
|
|
val is_alpha : char -> bool
|
|
(** Is the char a letter? *)
|
|
|
|
val is_num : char -> bool
|
|
(** Is the char a digit? *)
|
|
|
|
val is_alpha_num : char -> bool
|
|
(** Is the char a letter or a digit? *)
|
|
|
|
val is_space : char -> bool
|
|
(** True on ' ' and '\t'. *)
|
|
|
|
val is_white : char -> bool
|
|
(** True on ' ' and '\t' and '\n'. *)
|
|
|
|
val (<|>) : 'a t -> 'a t -> 'a t
|
|
(** [a <|> b] tries to parse [a], and if [a] fails without
|
|
consuming any input, backtracks and tries
|
|
to parse [b], otherwise it fails as [a].
|
|
See {!try_} to ensure [a] does not consume anything (but it is best
|
|
to avoid wrapping large parsers with {!try_}). *)
|
|
|
|
val (<?>) : 'a t -> string -> 'a t
|
|
(** [a <?> msg] behaves like [a], but if [a] fails without
|
|
consuming any input, it fails with [msg]
|
|
instead. Useful as the last choice in a series of [<|>]:
|
|
[a <|> b <|> c <?> "expected a|b|c"]. *)
|
|
|
|
val try_ : 'a t -> 'a t
|
|
(** [try_ p] tries to parse like [p], but backtracks if [p] fails.
|
|
Useful in combination with [<|>]. *)
|
|
|
|
val suspend : (unit -> 'a t) -> 'a t
|
|
(** [suspend f] is the same as [f ()], but evaluates [f ()] only
|
|
when needed. *)
|
|
|
|
val string : string -> string t
|
|
(** [string s] parses exactly the string [s], and nothing else. *)
|
|
|
|
val many : 'a t -> 'a list t
|
|
(** [many p] parses a list of [p], eagerly (as long as possible). *)
|
|
|
|
val many1 : 'a t -> 'a list t
|
|
(** Parse a non-empty list. *)
|
|
|
|
val skip : _ t -> unit t
|
|
(** [skip p] parses zero or more times [p] and ignores its result. *)
|
|
|
|
val sep : by:_ t -> 'a t -> 'a list t
|
|
(** [sep ~by p] parses a list of [p] separated by [by]. *)
|
|
|
|
val sep1 : by:_ t -> 'a t -> 'a list t
|
|
(** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *)
|
|
|
|
val fix : ('a t -> 'a t) -> 'a t
|
|
(** Fixpoint combinator. *)
|
|
|
|
val memo : 'a t -> 'a t
|
|
(** Memoize the parser. [memo p] will behave like [p], but when called
|
|
in a state (read: position in input) it has already processed, [memo p]
|
|
returns a result directly. The implementation uses an underlying
|
|
hashtable.
|
|
This can be costly in memory, but improve the run time a lot if there
|
|
is a lot of backtracking involving [p].
|
|
|
|
This function is not thread-safe. *)
|
|
|
|
val fix_memo : ('a t -> 'a t) -> 'a t
|
|
(** Like {!fix}, but the fixpoint is memoized. *)
|
|
|
|
val get_lnum : int t
|
|
(** Reflect the current line number. *)
|
|
|
|
val get_cnum : int t
|
|
(** Reflect the current column number. *)
|
|
|
|
val get_pos : (int * int) t
|
|
(** Reflect the current (line, column) numbers. *)
|
|
|
|
(** {2 Parse}
|
|
|
|
Those functions have a label [~p] on the parser, since 0.14.
|
|
*)
|
|
|
|
val parse : 'a t -> state -> 'a or_error
|
|
(** [parse p st] applies [p] on the input, and returns [Ok x] if
|
|
[p] succeeds with [x], or [Error s] otherwise. *)
|
|
|
|
val parse_exn : 'a t -> state -> 'a
|
|
(** Unsafe version of {!parse}.
|
|
@raise ParseError if it fails. *)
|
|
|
|
val parse_string : 'a t -> string -> 'a or_error
|
|
(** Specialization of {!parse} for string inputs. *)
|
|
|
|
val parse_string_exn : 'a t -> string -> 'a
|
|
(** @raise ParseError if it fails. *)
|
|
|
|
val parse_file : 'a t -> string -> 'a or_error
|
|
(** [parse_file p file] parses [file] with [p] by opening the file
|
|
and reading it whole. *)
|
|
|
|
val parse_file_exn : 'a t -> string -> 'a
|
|
(** @raise ParseError if it fails. *)
|
|
|
|
(** {2 Infix} *)
|
|
|
|
module Infix : sig
|
|
val (>|=) : 'a t -> ('a -> 'b) -> 'b t
|
|
(** Map. *)
|
|
|
|
val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
|
|
(** Monadic bind.
|
|
[p >>= f] results in a new parser which behaves as [p] then,
|
|
in case of success, applies [f] to the result. *)
|
|
|
|
val (<*>) : ('a -> 'b) t -> 'a t -> 'b t
|
|
(** Applicative. *)
|
|
|
|
val (<* ) : 'a t -> _ t -> 'a t
|
|
(** [a <* b] parses [a] into [x], parses [b] and ignores its result,
|
|
and returns [x]. *)
|
|
|
|
val ( *>) : _ t -> 'a t -> 'a t
|
|
(** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The
|
|
results of [a] is ignored. *)
|
|
|
|
val (<|>) : 'a t -> 'a t -> 'a t
|
|
(** [a <|> b] tries to parse [a], and if [a] fails without
|
|
consuming any input, backtracks and tries
|
|
to parse [b], otherwise it fails as [a].
|
|
See {!try_} to ensure [a] does not consume anything (but it is best
|
|
to avoid wrapping large parsers with {!try_}). *)
|
|
|
|
val (<?>) : 'a t -> string -> 'a t
|
|
(** [a <?> msg] behaves like [a], but if [a] fails without
|
|
consuming any input, it fails with [msg]
|
|
instead. Useful as the last choice in a series of [<|>]:
|
|
[a <|> b <|> c <?> "expected a|b|c"]. *)
|
|
|
|
end
|
|
|
|
(** {2 Utils}
|
|
|
|
This is useful to parse OCaml-like values in a simple way. *)
|
|
|
|
module U : sig
|
|
val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t
|
|
(** [list p] parses a list of [p], with the OCaml conventions for
|
|
start token "[", stop token "]" and separator ";".
|
|
Whitespace between items are skipped. *)
|
|
|
|
val int : int t
|
|
(** Parse an int. *)
|
|
|
|
val word : string t
|
|
(** Non empty string of alpha num, start with alpha. *)
|
|
|
|
val pair : ?start:string -> ?stop:string -> ?sep:string ->
|
|
'a t -> 'b t -> ('a * 'b) t
|
|
(** Parse a pair using OCaml whitespace conventions.
|
|
The default is "(a, b)". *)
|
|
|
|
val triple : ?start:string -> ?stop:string -> ?sep:string ->
|
|
'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t
|
|
(** Parse a triple using OCaml whitespace conventions.
|
|
The default is "(a, b, c)". *)
|
|
end
|