CCParse: add slice and the ability to recurse on them

the idea is that it's often convenient to split the input into smaller
part (e.g. lines), or do a first pass of parsing that just returns a
slice of the input; and then later to use another parser on that slice
to extract the actual data. The new notion of `slice` allows that,
while preserving locations wrt the original input.
This commit is contained in:
Simon Cruanes 2021-06-06 22:42:20 -04:00
parent 7bdc3cff24
commit 1517f64f55
2 changed files with 427 additions and 84 deletions

View file

@ -167,7 +167,7 @@ module Error = struct
Format.fprintf out "@[<hv>at line %d, char %d:@ %s@]" line col (self.msg())
end
type 'a or_error = ('a, Error.t) result
type +'a or_error = ('a, Error.t) result
module Memo_tbl = Hashtbl.Make(struct
type t = int * int (* id of parser, position *)
@ -186,7 +186,8 @@ end
(** Purely functional state passed around *)
type state = {
str: string; (* the input *)
i: int; (* offset in [input.str] *)
i: int; (* offset in [str] *)
j: int; (* end pointer in [str], excluded. [len = j-i] *)
memo : Memo_state.t option ref; (* Memoization table, if any *)
}
@ -207,11 +208,12 @@ let state_of_string str =
let s = {
str;
i=0;
j=String.length str;
memo=ref None;
} in
s
let[@inline] is_done st = st.i >= String.length st.str
let[@inline] is_done st = st.i >= st.j
let[@inline] cur st = st.str.[st.i]
let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None}
@ -328,7 +330,7 @@ let map2 f x y = pure f <*> x <*> y
let map3 f x y z = pure f <*> x <*> y <*> z
let junk_ (st:state) : state =
assert (st.i < String.length st.str);
assert (st.i < st.j);
{st with i=st.i + 1}
let eoi = {
@ -344,11 +346,61 @@ let eoi = {
(Ok true) (parse_string (U.bool <* skip_white <* eoi) "true")
*)
let with_pos p : _ t = {
run=fun st ~ok ~err ->
p.run st
~ok:(fun st' x ->
ok st' (x, pos_of_st_ st))
~err
}
(* a slice is just a state, which makes {!recurse} quite easy. *)
type slice = state
module Slice = struct
type t = slice
let length sl = sl.j - sl.i
let is_empty sl = sl.i = sl.j
let to_string sl = String.sub sl.str sl.i (length sl)
end
let recurse slice p : _ t = {
run=fun _st ~ok ~err ->
(* make sure these states are related. all slices share the
same reference as the initial state they derive from. *)
assert CCShims_.Stdlib.(_st.memo == slice.memo);
p.run slice ~ok ~err
}
let all = {
run=fun st ~ok ~err:_ ->
if is_done st then ok st st
else (
let st_done = {st with i=st.j} in
ok st_done st
)
}
let all_str = all >|= Slice.to_string
(*$= & ~printer:(errpp Q.Print.string) ~cmp:(erreq (=))
(Ok "abcd") (parse_string all_str "abcd")
(Ok "cd") (parse_string (string "ab" *> all_str) "ab")
*)
(*$= & ~printer:(errpp Q.Print.(pair string string)) ~cmp:(erreq (=))
(Ok ("foobar", "")) (parse_string (both all_str all_str) "foobar")
*)
let fail msg : _ t = {
run=fun st ~ok:_ ~err ->
err (mk_error_ st (const_str_ msg))
}
let failf msg = Printf.ksprintf fail msg
let fail_lazy msg = {
run=fun st ~ok:_ ~err ->
err (mk_error_ st msg)
}
let parsing what p = {
run=fun st ~ok ~err ->
@ -360,9 +412,10 @@ let parsing what p = {
err {e with Error.msg})
}
let nop = {
let empty = {
run=fun st ~ok ~err:_ -> ok st ();
}
let nop = empty
let any_char = {
run=fun st ~ok ~err -> consume_ st ~ok ~err
@ -398,19 +451,35 @@ let char_if ?descr p = {
~err
}
let chars_if p = {
let take_if p : slice t = {
run=fun st ~ok ~err:_ ->
let i0 = st.i in
let i = ref i0 in
let i = ref st.i in
while
let st = {st with i = !i} in
not (is_done st) && p (cur st)
do
incr i;
done;
ok {st with i = !i} (String.sub st.str i0 (!i - i0))
ok {st with i = !i} {st with j= !i}
}
let take1_if ?descr p =
take_if p >>= fun sl ->
if Slice.is_empty sl then (
let msg() =
let what = match descr with
| None -> ""
| Some d -> Printf.sprintf " for %s" d
in
Printf.sprintf "expected non-empty sequence of chars%s" what
in
fail_lazy msg
) else (
return sl
)
let chars_if p = take_if p >|= Slice.to_string
let chars1_if ?descr p = {
run=fun st ~ok ~err ->
(chars_if p).run st
@ -453,11 +522,11 @@ let chars_fold ~f acc0 = {
| `Fail msg -> raise (Fold_fail (st,msg))
)
done;
ok {st with i= !i} !acc
ok {st with i= !i} (!acc, {st with j= !i})
with Fold_fail (st,msg) -> err (mk_error_ st (const_str_ msg))
}
let chars_fold_map ~f acc0 = {
let chars_fold_transduce ~f acc0 = {
run=fun st ~ok ~err ->
let i0 = st.i in
let i = ref i0 in
@ -524,6 +593,22 @@ let try_or p1 ~f ~else_:p2 = {
~err:(fun _ -> p2.run st ~ok ~err)
}
let try_or_l ?(msg="try_or_l ran out of options") ?else_ l : _ t = {
run=fun st ~ok ~err ->
let rec loop = function
| (test, p) :: tl ->
test.run st
~ok:(fun _ _ -> p.run st ~ok ~err) (* commit *)
~err:(fun _ -> loop tl)
| [] ->
begin match else_ with
| None -> err (mk_error_ st (const_str_ msg))
| Some p -> p.run st ~ok ~err
end
in
loop l
}
let suspend f = {
run=fun st ~ok ~err ->
let p = f () in
@ -531,12 +616,12 @@ let suspend f = {
}
(* read [len] chars at once *)
let any_chars len : _ t = {
let take len : slice t = {
run=fun st ~ok ~err ->
if st.i + len <= String.length st.str then (
let s = String.sub st.str st.i len in
if st.i + len <= st.j then (
let slice = {st with j = st.i + len} in
let st = {st with i = st.i + len} in
ok st s
ok st slice
) else (
let msg() =
Printf.sprintf "expected to be able to consume %d chars" len
@ -545,6 +630,8 @@ let any_chars len : _ t = {
)
}
let any_chars len : _ t = take len >|= Slice.to_string
let exact s = {
run=fun st ~ok ~err ->
(* parse a string of length [String.length s] and compare with [s] *)
@ -697,58 +784,116 @@ let lookahead p : _ t = {
~err
}
let lookahead_ignore p : _ t = {
run=fun st ~ok ~err ->
p.run st
~ok:(fun _st _x -> ok st ())
~err
}
(*$= & ~printer:(errpp Q.Print.(string))
(Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd")
*)
let line : _ t = {
run=fun st ~ok ~err ->
if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI"))
else (
match String.index_from st.str st.i '\n' with
(*$=
(Ok "1234") (parse_string line_str "1234\nyolo")
(Ok ("1234", "yolo")) (parse_string (line_str ||| line_str) "1234\nyolo\nswag")
*)
let split_1 ~on_char : _ t = {
run=fun st ~ok ~err:_ ->
if st.i >= st.j then (
ok st (st, None)
) else (
match String.index_from st.str st.i on_char with
| j ->
let s = String.sub st.str st.i (j - st.i) in
ok {st with i=j+1} s
let x = {st with j} in
let y = {st with i=min st.j (j+1)} in
let st_done = {st with i=st.j} in (* empty *)
ok st_done (x, Some y)
| exception Not_found ->
err (mk_error_ st (const_str_ "unterminated line"))
let st_done = {st with i=st.j} in (* empty *)
ok st_done (st, None)
)
}
(*$=
(Ok "1234") (parse_string line "1234\nyolo")
(Ok ("1234", "yolo")) (parse_string (line ||| line) "1234\nyolo\nswag")
let split_list_at_most ~on_char n : slice list t =
let rec loop acc n =
if n <= 0 then return (List.rev acc)
else (
try_or
eoi
~f:(fun _ -> return (List.rev acc))
~else_:(parse_1 acc n)
)
and parse_1 acc n =
split_1 ~on_char >>= fun (sl1, rest) ->
let acc = sl1 :: acc in
match rest with
| None -> return (List.rev acc)
| Some rest -> recurse rest (loop acc (n-1))
in
loop [] n
(*$= & ~printer:(errpp Q.Print.(list string)) ~cmp:(erreq (=))
(Ok ["a";"b";"c";"d,e,f"]) \
(parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,b,c,d,e,f")
(Ok ["a";"bc"]) \
(parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,bc")
*)
(* parse a string [s] using [p_sub], then parse [s] using [p].
The result is that of parsing [s] using [p], but the state is
the one after using [p_sub], and errors are translated back into the context
of [p_sub].
This can be useful for example in [p_sub line some_line_parser]. *)
let parse_sub_ p_sub p : _ t = {
run=fun st0 ~ok ~err ->
let p = p <* eoi in (* make sure [p] reads all *)
p_sub.run st0
~ok:(fun st1 s ->
p.run (state_of_string s)
~ok:(fun _ r -> ok st1 r)
~err:(fun e ->
let pos = e.pos in
let pos = {
pos_buffer=pos.pos_buffer;
pos_offset=pos.pos_offset + st0.i;
pos_lc=None;
} in
err {e with pos}))
~err
}
let split_list ~on_char : _ t =
split_list_at_most ~on_char max_int
let split_2 ~on_char : _ t =
split_list_at_most ~on_char 2 >>= function
| [a; b] -> return (a,b)
| _ -> fail "split_2: expected 2 fields exactly"
let split_3 ~on_char : _ t =
split_list_at_most ~on_char 3 >>= function
| [a; b; c] -> return (a,b,c)
| _ -> fail "split_3: expected 3 fields exactly"
let split_4 ~on_char : _ t =
split_list_at_most ~on_char 4 >>= function
| [a; b; c; d] -> return (a,b,c,d)
| _ -> fail "split_4: expected 4 fields exactly"
let split_list ~on_char : slice list t =
let rec loop acc =
try_or
eoi
~f:(fun _ -> return (List.rev acc))
~else_:(parse_1 acc)
and parse_1 acc =
split_1 ~on_char >>= fun (sl1, rest) ->
let acc = sl1 :: acc in
match rest with
| None -> return (List.rev acc)
| Some rest -> recurse rest (loop acc)
in
loop []
let each_split ~on_char p : 'a list t =
let rec loop acc =
split_1 ~on_char >>= fun (sl1, rest) ->
(* parse [sl1] with [p] *)
recurse sl1 p >>= fun x ->
let acc = x :: acc in
match rest with
| None -> return (List.rev acc)
| Some rest -> recurse rest (loop acc)
in
loop []
let line : slice t =
split_1 ~on_char:'\n' >|= fst
let line_str = line >|= Slice.to_string
let each_line p : _ t =
fix
(fun self ->
try_or eoi
~f:(fun _ -> pure [])
(parse_sub_ line p >>= fun x ->
self >|= fun tl -> x :: tl))
each_split ~on_char:'\n' p
(*$= & ~printer:(errpp Q.Print.(list @@ list int))
(Ok ([[1;1];[2;2];[3;3]])) \

View file

@ -8,6 +8,8 @@
{2 A few examples}
Some more advanced example(s) can be found in the [/examples] directory.
{4 Parse a tree}
{[
@ -104,7 +106,7 @@ module Error : sig
(** Pretty prints the error *)
end
type 'a or_error = ('a, Error.t) result
type +'a or_error = ('a, Error.t) result
(** ['a or_error] is either [Ok x] for some result [x : 'a],
or an error {!Error.t}.
@ -146,21 +148,45 @@ val ap : ('a -> 'b) t -> 'a t -> 'b t
(** Applicative.
@since NEXT_RELEASE *)
val eoi : unit t
(** Expect the end of input, fails otherwise. *)
val nop : unit t
(** Succeed with [()]. *)
val empty : unit t
(** Succeed with [()], same as {!nop}.
@since NEXT_RELEASE *)
val fail : string -> 'a t
(** [fail msg] fails with the given message. It can trigger a backtrack. *)
val failf: ('a, unit, string, 'b t) format4 -> 'a
(** [Format.sprintf] version of {!fail}. *)
val fail_lazy : (unit -> string) -> 'a t
(** Like {!fail}, but only produce an error message on demand.
@since NEXT_RELEASE *)
val parsing : string -> 'a t -> 'a t
(** [parsing s p] behaves the same as [p], with the information that
we are parsing [s], if [p] fails. *)
we are parsing [s], if [p] fails.
The message [s] is added to the error, it does not replace it,
not does the location change (the error still points to
the same location as in [p]). *)
val eoi : unit t
(** Expect the end of input, fails otherwise. *)
val set_error_message : string -> 'a t -> 'a t
(** [set_error_message msg p] behaves like [p], but if [p] fails,
[set_error_message msg p] fails with [msg] instead and at the current
position. The internal error message of [p] is just discarded.
@since NEXT_RELEASE *)
val nop : unit t
(** Succeed with [()]. *)
val with_pos : 'a t -> ('a * position) t
(** [with_pos p] behaves like [p], but returns the (starting) position
along with [p]'s result.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val any_char : char t
(** [any_char] parses any character.
@ -169,27 +195,52 @@ val any_char : char t
val any_chars : int -> string t
(** [any_chars len] parses exactly [len] characters from the input.
Fails if the input doesn't contain at least [len] chars.
@since NEXT_RELEASE *)
val char : char -> char t
(** [char c] parses the character [c] and nothing else. *)
val char_if : ?descr:string -> (char -> bool) -> char t
(** [char_if f] parses a character [c] if [f c = true].
@param descr describes what kind of character was expected *)
type slice
(** A slice of the input, as returned by some combinators such
as {!split_1} or {split_n}.
val chars_if : (char -> bool) -> string t
(** [chars_if f] parses a string of chars that satisfy [f]. *)
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val chars1_if : ?descr:string -> (char -> bool) -> string t
(** Like {!chars_if}, but only non-empty strings.
@param descr describes what kind of character was expected *)
(** Functions on slices.
@since NEXT_RELEASE *)
module Slice : sig
type t = slice
val is_empty : t -> bool
(** Is the slice empty? *)
val length : t -> int
(** Length of the slice *)
val to_string : t -> string
(** Convert the slice into a string.
Linear time and memory in [length slice] *)
end
val recurse : slice -> 'a t -> 'a t
(** [recurse slice p] parses the [slice]
(most likely obtained via another combinator, such as {!split_1}
or {!split_n}), using [p].
The slice contains a position which is used to relocate error
messages to their position in the whole input, not just relative to
the slice.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val chars_fold :
f:('acc -> char ->
[`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) ->
'acc ->
'acc t
('acc * slice) t
(** [chars_fold f acc0] folds over characters of the input.
Each char [c] is passed, along with the current accumulator, to [f];
[f] can either:
@ -204,10 +255,11 @@ val chars_fold :
This is a generalization of of {!chars_if} that allows one to transform
characters on the fly, skip some, handle escape sequences, etc.
It can also be useful as a base component for a lexer.
@since NEXT_RELEASE *)
val chars_fold_map :
val chars_fold_transduce :
f:('acc -> char ->
[ `Continue of 'acc | `Yield of 'acc * char
| `Consume_and_stop | `Stop | `Fail of string]) ->
@ -215,13 +267,42 @@ val chars_fold_map :
('acc * string) t
(** Same as {!char_fold} but with the following differences:
- returns a string along with the accumulator. The string is built from
characters returned by [`Yield].
- returns a string along with the accumulator, rather than the slice
of all the characters accepted by [`Continue _].
The string is built from characters returned by [`Yield].
- new case [`Yield (acc, c)] adds [c] to the returned string
and continues parsing with [acc].
@since NEXT_RELEASE *)
val take : int -> slice t
(** [slice_of_len len] parses exactly [len] characters from the input.
Fails if the input doesn't contain at least [len] chars.
@since NEXT_RELEASE *)
val take_if : (char -> bool) -> slice t
(** [take_if f] takes characters as long as they satisfy the predicate [f].
@since NEXT_RELEASE *)
val take1_if : ?descr:string -> (char -> bool) -> slice t
(** [take1_if f] takes characters as long as they satisfy the predicate [f].
Fails if no character satisfies [f].
@since NEXT_RELEASE *)
val char_if : ?descr:string -> (char -> bool) -> char t
(** [char_if f] parses a character [c] if [f c = true].
Fails if the next char does not satisfy [f].
@param descr describes what kind of character was expected *)
val chars_if : (char -> bool) -> string t
(** [chars_if f] parses a string of chars that satisfy [f].
Cannot fail. *)
val chars1_if : ?descr:string -> (char -> bool) -> string t
(** Like {!chars_if}, but only non-empty strings.
Fails if the string is empty.
@param descr describes what kind of character was expected *)
val endline : char t
(** Parse '\n'. *)
@ -272,7 +353,9 @@ val many : 'a t -> 'a list t
val optional : _ t -> unit t
(** [optional p] tries to parse [p], and return [()] whether it
succeeded or failed. Cannot fail.
succeeded or failed. Cannot fail itself.
It consumes input if [p] succeeded (as much as [p] consumed), but
consumes not input if [p] failed.
@since NEXT_RELEASE *)
val try_ : 'a t -> 'a t
@ -307,6 +390,27 @@ val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t
@since NEXT_RELEASE
*)
val try_or_l :
?msg:string ->
?else_:'a t ->
(unit t * 'a t) list ->
'a t
(** [try_or_l ?else_ l] tries each pair [(test, p)] in order.
If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p],
whether [p] fails or not.
If they all fail, and [else_] is defined, then it behaves like [else_].
If all fail, and [else_] is [None], then it fails as well.
This is a performance optimization compared to {!(<|>)}. We commit to a
branch if the test succeeds, without backtracking at all.
See {!lookahead_ignore} for a convenient way of writing the test conditions.
@param msg error message if all options fail
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val or_ : 'a t -> 'a t -> 'a t
(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2]
from the same position.
@ -316,12 +420,6 @@ val both : 'a t -> 'b t -> ('a * 'b) t
(** [both a b] parses [a], then [b], then returns the pair of their results.
@since NEXT_RELEASE *)
val set_error_message : string -> 'a t -> 'a t
(** [set_error_message msg p] behaves like [p], but if [p] fails,
[set_error_message msg p] fails with [msg] instead.
@since NEXT_RELEASE
*)
val many1 : 'a t -> 'a list t
(** [many1 p] is like [many p] excepts it fails if the
list is empty (i.e. it needs [p] to succeed at least once). *)
@ -345,8 +443,26 @@ val lookahead : 'a t -> 'a t
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val line : string t
(** Parse a line, '\n' excluded.
val lookahead_ignore : 'a t -> unit t
(** [lookahead_ignore p] tries to parse input with [p],
and succeeds if [p] succeeds. However it doesn't consume any input
and returns [()], so in effect its only use-case is to detect
whether [p] succeeds, e.g. in {!cond}.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val fix : ('a t -> 'a t) -> 'a t
(** Fixpoint combinator. *)
val line : slice t
(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n'].
@since NEXT_RELEASE *)
val line_str : string t
(** [line_str] is [line >|= Slice.to_string].
It parses the next line and turns the slice into a string.
The state points to after the ['\n'] character.
@since NEXT_RELEASE *)
val each_line : 'a t -> 'a list t
@ -354,8 +470,90 @@ val each_line : 'a t -> 'a list t
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val fix : ('a t -> 'a t) -> 'a t
(** Fixpoint combinator. *)
val split_1 : on_char:char -> (slice * slice option) t
(** [split_1 ~on_char] looks for [on_char] in the input, and returns a
pair [sl1, sl2], where:
- [sl1] is the slice of the input the precedes the first occurrence
of [on_char], or the whole input if [on_char] cannot be found.
- [sl2] is the slice that comes after [on_char],
or [None] if [on_char] couldn't be found.
The parser is now positioned at the end of the input.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val split_list : on_char:char -> slice list t
(** [split_n ~on_char] splits the input on all occurrences of [on_char],
returning a list of slices.
A useful specialization of this is {!each_line}, which is
basically [split_n ~on_char:'\n' p].
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val split_list_at_most : on_char:char -> int -> slice list t
(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most
[n] times, to get a list of [n+1] elements.
The last element might contain [on_char]. This is useful to limit the
amount of work done by {!split_list}.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val split_2 : on_char:char -> (slice * slice) t
(** [split_2 ~on_char] splits the input into exactly 2 fields,
and fails if the split yields less or more than 2 items.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val split_3 : on_char:char -> (slice * slice * slice) t
(** See {!split_2}
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val split_4 : on_char:char -> (slice * slice * slice * slice) t
(** See {!split_2}
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val each_split : on_char:char -> 'a t -> 'a list t
(** [split_list_map ~on_char p] uses [split_list ~on_char] to split
the input, then parses each chunk of the input thus obtained using [p].
The difference with [sep ~by:(char on_char) p] is that
[sep] calls [p] first, and only tries to find [on_char] after [p] returns.
While it is more flexible, this technique also means [p] has to be careful
not to consume [on_char] by error.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
val all : slice t
(** [all] returns all the unconsumed input as a slice, and consumes it.
Use {!Slice.to_string} to turn it into a string.
Note that [lookahead all] can be used to {i peek} at the rest of the input
without consuming anything.
@since NEXT_RELEASE *)
val all_str : string t
(** [all_str] accepts all the remaining chars and extracts them into a
string. Similar to {!rest_of_input} but with a string.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
(* TODO
val trim : slice t
(** [trim] is like {!all}, but removes whitespace on the left and right.
{b EXPERIMENTAL}
@since NEXT_RELEASE *)
*)
val memo : 'a t -> 'a t
(** Memoize the parser. [memo p] will behave like [p], but when called