diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index aaaae7f6..2fbdf451 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -167,7 +167,7 @@ module Error = struct Format.fprintf out "@[at line %d, char %d:@ %s@]" line col (self.msg()) end -type 'a or_error = ('a, Error.t) result +type +'a or_error = ('a, Error.t) result module Memo_tbl = Hashtbl.Make(struct type t = int * int (* id of parser, position *) @@ -186,7 +186,8 @@ end (** Purely functional state passed around *) type state = { str: string; (* the input *) - i: int; (* offset in [input.str] *) + i: int; (* offset in [str] *) + j: int; (* end pointer in [str], excluded. [len = j-i] *) memo : Memo_state.t option ref; (* Memoization table, if any *) } @@ -207,11 +208,12 @@ let state_of_string str = let s = { str; i=0; + j=String.length str; memo=ref None; } in s -let[@inline] is_done st = st.i >= String.length st.str +let[@inline] is_done st = st.i >= st.j let[@inline] cur st = st.str.[st.i] let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None} @@ -328,7 +330,7 @@ let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z let junk_ (st:state) : state = - assert (st.i < String.length st.str); + assert (st.i < st.j); {st with i=st.i + 1} let eoi = { @@ -344,11 +346,61 @@ let eoi = { (Ok true) (parse_string (U.bool <* skip_white <* eoi) "true") *) +let with_pos p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st' x -> + ok st' (x, pos_of_st_ st)) + ~err +} + +(* a slice is just a state, which makes {!recurse} quite easy. *) +type slice = state + +module Slice = struct + type t = slice + let length sl = sl.j - sl.i + let is_empty sl = sl.i = sl.j + let to_string sl = String.sub sl.str sl.i (length sl) +end + +let recurse slice p : _ t = { + run=fun _st ~ok ~err -> + (* make sure these states are related. all slices share the + same reference as the initial state they derive from. *) + assert CCShims_.Stdlib.(_st.memo == slice.memo); + p.run slice ~ok ~err +} + +let all = { + run=fun st ~ok ~err:_ -> + if is_done st then ok st st + else ( + let st_done = {st with i=st.j} in + ok st_done st + ) +} + +let all_str = all >|= Slice.to_string + +(*$= & ~printer:(errpp Q.Print.string) ~cmp:(erreq (=)) + (Ok "abcd") (parse_string all_str "abcd") + (Ok "cd") (parse_string (string "ab" *> all_str) "ab") +*) + +(*$= & ~printer:(errpp Q.Print.(pair string string)) ~cmp:(erreq (=)) + (Ok ("foobar", "")) (parse_string (both all_str all_str) "foobar") + *) + let fail msg : _ t = { run=fun st ~ok:_ ~err -> err (mk_error_ st (const_str_ msg)) } let failf msg = Printf.ksprintf fail msg +let fail_lazy msg = { + run=fun st ~ok:_ ~err -> + err (mk_error_ st msg) +} let parsing what p = { run=fun st ~ok ~err -> @@ -360,9 +412,10 @@ let parsing what p = { err {e with Error.msg}) } -let nop = { +let empty = { run=fun st ~ok ~err:_ -> ok st (); } +let nop = empty let any_char = { run=fun st ~ok ~err -> consume_ st ~ok ~err @@ -398,19 +451,35 @@ let char_if ?descr p = { ~err } -let chars_if p = { +let take_if p : slice t = { run=fun st ~ok ~err:_ -> - let i0 = st.i in - let i = ref i0 in + let i = ref st.i in while let st = {st with i = !i} in not (is_done st) && p (cur st) do incr i; done; - ok {st with i = !i} (String.sub st.str i0 (!i - i0)) + ok {st with i = !i} {st with j= !i} } +let take1_if ?descr p = + take_if p >>= fun sl -> + if Slice.is_empty sl then ( + let msg() = + let what = match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + fail_lazy msg + ) else ( + return sl + ) + +let chars_if p = take_if p >|= Slice.to_string + let chars1_if ?descr p = { run=fun st ~ok ~err -> (chars_if p).run st @@ -453,11 +522,11 @@ let chars_fold ~f acc0 = { | `Fail msg -> raise (Fold_fail (st,msg)) ) done; - ok {st with i= !i} !acc + ok {st with i= !i} (!acc, {st with j= !i}) with Fold_fail (st,msg) -> err (mk_error_ st (const_str_ msg)) } -let chars_fold_map ~f acc0 = { +let chars_fold_transduce ~f acc0 = { run=fun st ~ok ~err -> let i0 = st.i in let i = ref i0 in @@ -524,6 +593,22 @@ let try_or p1 ~f ~else_:p2 = { ~err:(fun _ -> p2.run st ~ok ~err) } +let try_or_l ?(msg="try_or_l ran out of options") ?else_ l : _ t = { + run=fun st ~ok ~err -> + let rec loop = function + | (test, p) :: tl -> + test.run st + ~ok:(fun _ _ -> p.run st ~ok ~err) (* commit *) + ~err:(fun _ -> loop tl) + | [] -> + begin match else_ with + | None -> err (mk_error_ st (const_str_ msg)) + | Some p -> p.run st ~ok ~err + end + in + loop l +} + let suspend f = { run=fun st ~ok ~err -> let p = f () in @@ -531,12 +616,12 @@ let suspend f = { } (* read [len] chars at once *) -let any_chars len : _ t = { +let take len : slice t = { run=fun st ~ok ~err -> - if st.i + len <= String.length st.str then ( - let s = String.sub st.str st.i len in + if st.i + len <= st.j then ( + let slice = {st with j = st.i + len} in let st = {st with i = st.i + len} in - ok st s + ok st slice ) else ( let msg() = Printf.sprintf "expected to be able to consume %d chars" len @@ -545,6 +630,8 @@ let any_chars len : _ t = { ) } +let any_chars len : _ t = take len >|= Slice.to_string + let exact s = { run=fun st ~ok ~err -> (* parse a string of length [String.length s] and compare with [s] *) @@ -697,58 +784,116 @@ let lookahead p : _ t = { ~err } +let lookahead_ignore p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun _st _x -> ok st ()) + ~err +} + (*$= & ~printer:(errpp Q.Print.(string)) (Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd") *) -let line : _ t = { - run=fun st ~ok ~err -> - if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) - else ( - match String.index_from st.str st.i '\n' with +(*$= + (Ok "1234") (parse_string line_str "1234\nyolo") + (Ok ("1234", "yolo")) (parse_string (line_str ||| line_str) "1234\nyolo\nswag") +*) + +let split_1 ~on_char : _ t = { + run=fun st ~ok ~err:_ -> + if st.i >= st.j then ( + ok st (st, None) + ) else ( + match String.index_from st.str st.i on_char with | j -> - let s = String.sub st.str st.i (j - st.i) in - ok {st with i=j+1} s + let x = {st with j} in + let y = {st with i=min st.j (j+1)} in + let st_done = {st with i=st.j} in (* empty *) + ok st_done (x, Some y) | exception Not_found -> - err (mk_error_ st (const_str_ "unterminated line")) + let st_done = {st with i=st.j} in (* empty *) + ok st_done (st, None) ) } -(*$= - (Ok "1234") (parse_string line "1234\nyolo") - (Ok ("1234", "yolo")) (parse_string (line ||| line) "1234\nyolo\nswag") +let split_list_at_most ~on_char n : slice list t = + let rec loop acc n = + if n <= 0 then return (List.rev acc) + else ( + try_or + eoi + ~f:(fun _ -> return (List.rev acc)) + ~else_:(parse_1 acc n) + ) + and parse_1 acc n = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc (n-1)) + in + loop [] n + +(*$= & ~printer:(errpp Q.Print.(list string)) ~cmp:(erreq (=)) + (Ok ["a";"b";"c";"d,e,f"]) \ + (parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,b,c,d,e,f") + (Ok ["a";"bc"]) \ + (parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,bc") *) -(* parse a string [s] using [p_sub], then parse [s] using [p]. - The result is that of parsing [s] using [p], but the state is - the one after using [p_sub], and errors are translated back into the context - of [p_sub]. - This can be useful for example in [p_sub line some_line_parser]. *) -let parse_sub_ p_sub p : _ t = { - run=fun st0 ~ok ~err -> - let p = p <* eoi in (* make sure [p] reads all *) - p_sub.run st0 - ~ok:(fun st1 s -> - p.run (state_of_string s) - ~ok:(fun _ r -> ok st1 r) - ~err:(fun e -> - let pos = e.pos in - let pos = { - pos_buffer=pos.pos_buffer; - pos_offset=pos.pos_offset + st0.i; - pos_lc=None; - } in - err {e with pos})) - ~err -} +let split_list ~on_char : _ t = + split_list_at_most ~on_char max_int + +let split_2 ~on_char : _ t = + split_list_at_most ~on_char 2 >>= function + | [a; b] -> return (a,b) + | _ -> fail "split_2: expected 2 fields exactly" + +let split_3 ~on_char : _ t = + split_list_at_most ~on_char 3 >>= function + | [a; b; c] -> return (a,b,c) + | _ -> fail "split_3: expected 3 fields exactly" + +let split_4 ~on_char : _ t = + split_list_at_most ~on_char 4 >>= function + | [a; b; c; d] -> return (a,b,c,d) + | _ -> fail "split_4: expected 4 fields exactly" + +let split_list ~on_char : slice list t = + let rec loop acc = + try_or + eoi + ~f:(fun _ -> return (List.rev acc)) + ~else_:(parse_1 acc) + and parse_1 acc = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let each_split ~on_char p : 'a list t = + let rec loop acc = + split_1 ~on_char >>= fun (sl1, rest) -> + (* parse [sl1] with [p] *) + recurse sl1 p >>= fun x -> + let acc = x :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let line : slice t = + split_1 ~on_char:'\n' >|= fst + +let line_str = line >|= Slice.to_string let each_line p : _ t = - fix - (fun self -> - try_or eoi - ~f:(fun _ -> pure []) - (parse_sub_ line p >>= fun x -> - self >|= fun tl -> x :: tl)) + each_split ~on_char:'\n' p (*$= & ~printer:(errpp Q.Print.(list @@ list int)) (Ok ([[1;1];[2;2];[3;3]])) \ diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index e20b4914..c3a035c6 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -8,6 +8,8 @@ {2 A few examples} + Some more advanced example(s) can be found in the [/examples] directory. + {4 Parse a tree} {[ @@ -104,7 +106,7 @@ module Error : sig (** Pretty prints the error *) end -type 'a or_error = ('a, Error.t) result +type +'a or_error = ('a, Error.t) result (** ['a or_error] is either [Ok x] for some result [x : 'a], or an error {!Error.t}. @@ -146,21 +148,45 @@ val ap : ('a -> 'b) t -> 'a t -> 'b t (** Applicative. @since NEXT_RELEASE *) +val eoi : unit t +(** Expect the end of input, fails otherwise. *) + +val nop : unit t +(** Succeed with [()]. *) + +val empty : unit t +(** Succeed with [()], same as {!nop}. + @since NEXT_RELEASE *) + val fail : string -> 'a t (** [fail msg] fails with the given message. It can trigger a backtrack. *) val failf: ('a, unit, string, 'b t) format4 -> 'a (** [Format.sprintf] version of {!fail}. *) +val fail_lazy : (unit -> string) -> 'a t +(** Like {!fail}, but only produce an error message on demand. + @since NEXT_RELEASE *) + val parsing : string -> 'a t -> 'a t (** [parsing s p] behaves the same as [p], with the information that - we are parsing [s], if [p] fails. *) + we are parsing [s], if [p] fails. + The message [s] is added to the error, it does not replace it, + not does the location change (the error still points to + the same location as in [p]). *) -val eoi : unit t -(** Expect the end of input, fails otherwise. *) +val set_error_message : string -> 'a t -> 'a t +(** [set_error_message msg p] behaves like [p], but if [p] fails, + [set_error_message msg p] fails with [msg] instead and at the current + position. The internal error message of [p] is just discarded. + @since NEXT_RELEASE *) -val nop : unit t -(** Succeed with [()]. *) +val with_pos : 'a t -> ('a * position) t +(** [with_pos p] behaves like [p], but returns the (starting) position + along with [p]'s result. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val any_char : char t (** [any_char] parses any character. @@ -169,27 +195,52 @@ val any_char : char t val any_chars : int -> string t (** [any_chars len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. @since NEXT_RELEASE *) val char : char -> char t (** [char c] parses the character [c] and nothing else. *) -val char_if : ?descr:string -> (char -> bool) -> char t -(** [char_if f] parses a character [c] if [f c = true]. - @param descr describes what kind of character was expected *) +type slice +(** A slice of the input, as returned by some combinators such + as {!split_1} or {split_n}. -val chars_if : (char -> bool) -> string t -(** [chars_if f] parses a string of chars that satisfy [f]. *) + {b EXPERIMENTAL} + @since NEXT_RELEASE *) -val chars1_if : ?descr:string -> (char -> bool) -> string t -(** Like {!chars_if}, but only non-empty strings. - @param descr describes what kind of character was expected *) +(** Functions on slices. + @since NEXT_RELEASE *) +module Slice : sig + type t = slice + + val is_empty : t -> bool + (** Is the slice empty? *) + + val length : t -> int + (** Length of the slice *) + + val to_string : t -> string + (** Convert the slice into a string. + Linear time and memory in [length slice] *) +end + +val recurse : slice -> 'a t -> 'a t +(** [recurse slice p] parses the [slice] + (most likely obtained via another combinator, such as {!split_1} + or {!split_n}), using [p]. + + The slice contains a position which is used to relocate error + messages to their position in the whole input, not just relative to + the slice. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val chars_fold : f:('acc -> char -> [`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) -> 'acc -> - 'acc t + ('acc * slice) t (** [chars_fold f acc0] folds over characters of the input. Each char [c] is passed, along with the current accumulator, to [f]; [f] can either: @@ -204,24 +255,54 @@ val chars_fold : This is a generalization of of {!chars_if} that allows one to transform characters on the fly, skip some, handle escape sequences, etc. + It can also be useful as a base component for a lexer. @since NEXT_RELEASE *) -val chars_fold_map : +val chars_fold_transduce : f:('acc -> char -> - [`Continue of 'acc | `Yield of 'acc * char + [ `Continue of 'acc | `Yield of 'acc * char | `Consume_and_stop | `Stop | `Fail of string]) -> 'acc -> ('acc * string) t (** Same as {!char_fold} but with the following differences: - - returns a string along with the accumulator. The string is built from - characters returned by [`Yield]. + - returns a string along with the accumulator, rather than the slice + of all the characters accepted by [`Continue _]. + The string is built from characters returned by [`Yield]. - new case [`Yield (acc, c)] adds [c] to the returned string and continues parsing with [acc]. @since NEXT_RELEASE *) +val take : int -> slice t +(** [slice_of_len len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. + @since NEXT_RELEASE *) + +val take_if : (char -> bool) -> slice t +(** [take_if f] takes characters as long as they satisfy the predicate [f]. + @since NEXT_RELEASE *) + +val take1_if : ?descr:string -> (char -> bool) -> slice t +(** [take1_if f] takes characters as long as they satisfy the predicate [f]. + Fails if no character satisfies [f]. + @since NEXT_RELEASE *) + +val char_if : ?descr:string -> (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true]. + Fails if the next char does not satisfy [f]. + @param descr describes what kind of character was expected *) + +val chars_if : (char -> bool) -> string t +(** [chars_if f] parses a string of chars that satisfy [f]. + Cannot fail. *) + +val chars1_if : ?descr:string -> (char -> bool) -> string t +(** Like {!chars_if}, but only non-empty strings. + Fails if the string is empty. + @param descr describes what kind of character was expected *) + val endline : char t (** Parse '\n'. *) @@ -272,7 +353,9 @@ val many : 'a t -> 'a list t val optional : _ t -> unit t (** [optional p] tries to parse [p], and return [()] whether it - succeeded or failed. Cannot fail. + succeeded or failed. Cannot fail itself. + It consumes input if [p] succeeded (as much as [p] consumed), but + consumes not input if [p] failed. @since NEXT_RELEASE *) val try_ : 'a t -> 'a t @@ -307,6 +390,27 @@ val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t @since NEXT_RELEASE *) +val try_or_l : + ?msg:string -> + ?else_:'a t -> + (unit t * 'a t) list -> + 'a t +(** [try_or_l ?else_ l] tries each pair [(test, p)] in order. + If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p], + whether [p] fails or not. + If they all fail, and [else_] is defined, then it behaves like [else_]. + If all fail, and [else_] is [None], then it fails as well. + + This is a performance optimization compared to {!(<|>)}. We commit to a + branch if the test succeeds, without backtracking at all. + + See {!lookahead_ignore} for a convenient way of writing the test conditions. + + @param msg error message if all options fail + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + val or_ : 'a t -> 'a t -> 'a t (** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] from the same position. @@ -316,12 +420,6 @@ val both : 'a t -> 'b t -> ('a * 'b) t (** [both a b] parses [a], then [b], then returns the pair of their results. @since NEXT_RELEASE *) -val set_error_message : string -> 'a t -> 'a t -(** [set_error_message msg p] behaves like [p], but if [p] fails, - [set_error_message msg p] fails with [msg] instead. - @since NEXT_RELEASE -*) - val many1 : 'a t -> 'a list t (** [many1 p] is like [many p] excepts it fails if the list is empty (i.e. it needs [p] to succeed at least once). *) @@ -345,8 +443,26 @@ val lookahead : 'a t -> 'a t {b EXPERIMENTAL} @since NEXT_RELEASE *) -val line : string t -(** Parse a line, '\n' excluded. +val lookahead_ignore : 'a t -> unit t +(** [lookahead_ignore p] tries to parse input with [p], + and succeeds if [p] succeeds. However it doesn't consume any input + and returns [()], so in effect its only use-case is to detect + whether [p] succeeds, e.g. in {!cond}. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val fix : ('a t -> 'a t) -> 'a t +(** Fixpoint combinator. *) + +val line : slice t +(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n']. + @since NEXT_RELEASE *) + +val line_str : string t +(** [line_str] is [line >|= Slice.to_string]. + It parses the next line and turns the slice into a string. + The state points to after the ['\n'] character. @since NEXT_RELEASE *) val each_line : 'a t -> 'a list t @@ -354,8 +470,90 @@ val each_line : 'a t -> 'a list t {b EXPERIMENTAL} @since NEXT_RELEASE *) -val fix : ('a t -> 'a t) -> 'a t -(** Fixpoint combinator. *) +val split_1 : on_char:char -> (slice * slice option) t +(** [split_1 ~on_char] looks for [on_char] in the input, and returns a + pair [sl1, sl2], where: + + - [sl1] is the slice of the input the precedes the first occurrence + of [on_char], or the whole input if [on_char] cannot be found. + - [sl2] is the slice that comes after [on_char], + or [None] if [on_char] couldn't be found. + + The parser is now positioned at the end of the input. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_list : on_char:char -> slice list t +(** [split_n ~on_char] splits the input on all occurrences of [on_char], + returning a list of slices. + + A useful specialization of this is {!each_line}, which is + basically [split_n ~on_char:'\n' p]. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_list_at_most : on_char:char -> int -> slice list t +(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most + [n] times, to get a list of [n+1] elements. + The last element might contain [on_char]. This is useful to limit the + amount of work done by {!split_list}. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + + +val split_2 : on_char:char -> (slice * slice) t +(** [split_2 ~on_char] splits the input into exactly 2 fields, + and fails if the split yields less or more than 2 items. + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_3 : on_char:char -> (slice * slice * slice) t +(** See {!split_2} + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_4 : on_char:char -> (slice * slice * slice * slice) t +(** See {!split_2} + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val each_split : on_char:char -> 'a t -> 'a list t +(** [split_list_map ~on_char p] uses [split_list ~on_char] to split + the input, then parses each chunk of the input thus obtained using [p]. + + The difference with [sep ~by:(char on_char) p] is that + [sep] calls [p] first, and only tries to find [on_char] after [p] returns. + While it is more flexible, this technique also means [p] has to be careful + not to consume [on_char] by error. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val all : slice t +(** [all] returns all the unconsumed input as a slice, and consumes it. + Use {!Slice.to_string} to turn it into a string. + + Note that [lookahead all] can be used to {i peek} at the rest of the input + without consuming anything. + + @since NEXT_RELEASE *) + +val all_str : string t +(** [all_str] accepts all the remaining chars and extracts them into a + string. Similar to {!rest_of_input} but with a string. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +(* TODO +val trim : slice t +(** [trim] is like {!all}, but removes whitespace on the left and right. + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + *) val memo : 'a t -> 'a t (** Memoize the parser. [memo p] will behave like [p], but when called