diff --git a/_oasis b/_oasis index 2f10d395..4ae7b35f 100644 --- a/_oasis +++ b/_oasis @@ -41,7 +41,7 @@ Library "containers" Modules: CCVector, CCHeap, CCList, CCOpt, CCPair, CCFun, CCHash, CCInt, CCBool, CCFloat, CCArray, CCRef, CCSet, CCOrd, CCRandom, CCString, CCHashtbl, CCMap, CCFormat, CCIO, - CCInt64, CCChar, CCResult, Containers + CCInt64, CCChar, CCResult, CCParse, Containers BuildDepends: bytes, result # BuildDepends: bytes, bisect_ppx diff --git a/doc/intro.txt b/doc/intro.txt index 99a13099..253dc5d9 100644 --- a/doc/intro.txt +++ b/doc/intro.txt @@ -42,6 +42,7 @@ CCMap CCOpt CCOrd CCPair +CCParse CCRandom CCRef CCResult diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml new file mode 100644 index 00000000..a273c200 --- /dev/null +++ b/src/core/CCParse.ml @@ -0,0 +1,414 @@ + +(* This file is free software. See file "license" for more details. *) + +(** {1 Very Simple Parser Combinators} *) + +type 'a or_error = ('a, string) Result.result + +type line_num = int +type col_num = int + +module MemoTbl = struct + module H = Hashtbl.Make(struct + type t = int * int (* id of parser, position *) + let equal ((a,b):t)(c,d) = a=c && b=d + let hash = Hashtbl.hash + end) + + (* table of closures, used to implement universal type *) + type t = (unit -> unit) H.t lazy_t + + let create n = lazy (H.create n) + + (* unique ID for each parser *) + let id_ = ref 0 + + type 'a res = + | Fail of exn + | Ok of 'a +end + +type position = int * int * int (* pos, line, column *) + +type parse_branch = (line_num * col_num * string option) list + +type state = { + str: string; (* the input *) + mutable i: int; (* offset *) + mutable lnum : line_num; (* Line number *) + mutable cnum : col_num; (* Column number *) + mutable branch: parse_branch; + memo : MemoTbl.t; (* Memoization table, if any *) +} + +exception ParseError of parse_branch * (unit -> string) + +let rec string_of_branch l = + let pp_s () = function + | None -> "" + | Some s -> Format.sprintf "while parsing %s, " s + in + match l with + | [] -> "" + | [l,c,s] -> + Format.sprintf "@[%aat line %d, col %d@]" pp_s s l c + | (l,c,s) :: tail -> + Format.sprintf "@[%aat line %d, col %d@]@,%s" pp_s s l c (string_of_branch tail) + +let () = Printexc.register_printer + (function + | ParseError (b,msg) -> + Some (Format.sprintf "@[%s@ %s@]" (msg()) (string_of_branch b)) + | _ -> None) + +let const_ x () = x + +let state_of_string str = + let s = { + str; + i=0; + lnum=1; + cnum=1; + branch=[]; + memo=MemoTbl.create 32; + } in + s + +let is_done st = st.i = String.length st.str +let cur st = st.str.[st.i] + +let fail_ ~err st msg = + let b = (st.lnum, st.cnum, None) :: st.branch in + err (ParseError (b, msg)) + +let next st ~ok ~err = + if st.i = String.length st.str + then fail_ st ~err (const_ "unexpected end of input") + else ( + let c = st.str.[st.i] in + st.i <- st.i + 1; + if c='\n' + then (st.lnum <- st.lnum + 1; st.cnum <- 1) + else st.cnum <- st.cnum + 1; + ok c + ) + +let pos st = st.i, st.lnum, st.cnum + +let backtrack st (i',l',c') = + assert (0 <= i' && i' <= st.i); + st.i <- i'; + st.lnum <- l'; + st.cnum <- c'; + () + +type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit + +let return : 'a -> 'a t = fun x _st ~ok ~err:_ -> ok x +let pure = return +let (>|=) : 'a t -> ('a -> 'b) -> 'b t + = fun p f st ~ok ~err -> p st ~err ~ok:(fun x -> ok (f x)) +let (>>=) : 'a t -> ('a -> 'b t) -> 'b t + = fun p f st ~ok ~err -> p st ~err ~ok:(fun x -> f x st ~err ~ok) +let (<*>) : ('a -> 'b) t -> 'a t -> 'b t + = fun f x st ~ok ~err -> + f st ~err ~ok:(fun f' -> x st ~err ~ok:(fun x' -> ok (f' x'))) +let (<* ) : 'a t -> _ t -> 'a t + = fun x y st ~ok ~err -> + x st ~err ~ok:(fun res -> y st ~err ~ok:(fun _ -> ok res)) +let ( *>) : _ t -> 'a t -> 'a t + = fun x y st ~ok ~err -> + x st ~err ~ok:(fun _ -> y st ~err ~ok) + +let map f x = x >|= f +let map2 f x y = pure f <*> x <*> y +let map3 f x y z = pure f <*> x <*> y <*> z + +let junk_ st = next st ~err:(fun _ -> assert false) ~ok:ignore + +let eoi st ~ok ~err = + if is_done st + then ok () + else fail_ ~err st (const_ "expected EOI") + +let fail msg st ~ok:_ ~err = fail_ ~err st (const_ msg) +let failf msg = Printf.ksprintf fail msg + +let parsing s p st ~ok ~err = + st.branch <- (st.lnum, st.cnum, Some s) :: st.branch; + p st + ~ok:(fun x -> st.branch <- List.tl st.branch; ok x) + ~err:(fun e -> st.branch <- List.tl st.branch; err e) + +let nop _ ~ok ~err:_ = ok() + +let char c = + let msg = Printf.sprintf "expected '%c'" c in + fun st ~ok ~err -> + next st ~err + ~ok:(fun c' -> if c=c' then ok c else fail_ ~err st (const_ msg)) + +let char_if p st ~ok ~err = + next st ~err + ~ok:(fun c -> + if p c then ok c + else fail_ ~err st (fun () -> Printf.sprintf "unexpected char '%c'" c) + ) + +let chars_if p st ~ok ~err:_ = + let i = st.i in + let len = ref 0 in + while not (is_done st) && p (cur st) do junk_ st; incr len done; + ok (String.sub st.str i !len) + +let chars1_if p st ~ok ~err = + chars_if p st ~err + ~ok:(fun s -> + if s = "" + then fail_ ~err st (const_ "unexpected sequence of chars") + else ok s) + +let rec skip_chars p st ~ok ~err = + if not (is_done st) && p (cur st) then ( + junk_ st; + skip_chars p st ~ok ~err + ) else ok() + +let is_alpha = function + | 'a' .. 'z' | 'A' .. 'Z' -> true + | _ -> false +let is_num = function '0' .. '9' -> true | _ -> false +let is_alpha_num = function + | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true + | _ -> false +let is_space = function ' ' | '\t' -> true | _ -> false +let is_white = function ' ' | '\t' | '\n' -> true | _ -> false + +let space = char_if is_space +let white = char_if is_white + +let endline st ~ok ~err = + next st ~err + ~ok:(function + | '\n' as c -> ok c + | _ -> fail_ ~err st (const_ "expected end-of-line")) + +let skip_space = skip_chars is_space +let skip_white = skip_chars is_white + +let (<|>) : 'a t -> 'a t -> 'a t + = fun x y st ~ok ~err -> + let i = st.i in + x st ~ok + ~err:(fun e -> + let j = st.i in + if i=j then y st ~ok ~err (* try [y] *) + else err e (* fail *) + ) + +let try_ : 'a t -> 'a t + = fun p st ~ok ~err -> + let i = pos st in + p st ~ok + ~err:(fun e -> + backtrack st i; + err e) + +let suspend f st ~ok ~err = f () st ~ok ~err + +let () : 'a t -> string -> 'a t + = fun x msg st ~ok ~err -> + let i = st.i in + x st ~ok + ~err:(fun e -> + if st.i = i + then fail_ ~err st (fun () -> msg) + else err e) + +let string s st ~ok ~err = + let rec check i = + if i = String.length s then ok s + else + next st ~err + ~ok:(fun c -> + if c = s.[i] + then check (i+1) + else fail_ ~err st (fun () -> Printf.sprintf "expected \"%s\"" s)) + in + check 0 + +let rec many_rec : 'a t -> 'a list -> 'a list t = fun p acc st ~ok ~err -> + if is_done st then ok(List.rev acc) + else + p st ~err + ~ok:(fun x -> + let i = pos st in + many_rec p (x :: acc) st ~ok + ~err:(fun _ -> + backtrack st i; + ok(List.rev acc)) + ) + +let many : 'a t -> 'a list t + = fun p st ~ok ~err -> many_rec p [] st ~ok ~err + +let many1 : 'a t -> 'a list t = + fun p st ~ok ~err -> + p st ~err ~ok:(fun x -> many_rec p [x] st ~err ~ok) + +let rec skip p st ~ok ~err = + let i = pos st in + p st + ~ok:(fun _ -> skip p st ~ok ~err) + ~err:(fun _ -> + backtrack st i; + ok() + ) + +(* by (sep1 ~by p) *) +let rec sep_rec ~by p = try_ by *> sep1 ~by p + +and sep1 ~by p = + p >>= fun x -> + (sep_rec ~by p >|= fun tl -> x::tl) + <|> return [x] + +let sep ~by p = + (try_ p >>= fun x -> + (sep_rec ~by p >|= fun tl -> x::tl) + <|> return [x]) + <|> return [] + +let fix f = + let rec p st ~ok ~err = f p st ~ok ~err in + p + +let memo (type a) (p:a t):a t = + let id = !MemoTbl.id_ in + incr MemoTbl.id_; + let r = ref None in (* used for universal encoding *) + fun st ~ok ~err -> + let i = st.i in + let (lazy tbl) = st.memo in + try + let f = MemoTbl.H.find tbl (i,id) in + (* extract hidden value *) + r := None; + f (); + begin match !r with + | None -> assert false + | Some (MemoTbl.Ok x) -> ok x + | Some (MemoTbl.Fail e) -> err e + end + with Not_found -> + (* parse, and save *) + p st + ~err:(fun e -> + MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Fail e)); + err e) + ~ok:(fun x -> + MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Ok x)); + ok x) + +let fix_memo f = + let rec p = + let p' = lazy (memo p) in + fun st ~ok ~err -> f (Lazy.force p') st ~ok ~err + in + p + +let get_lnum = fun st ~ok ~err:_ -> ok st.lnum +let get_cnum = fun st ~ok ~err:_ -> ok st.cnum +let get_pos = fun st ~ok ~err:_ -> ok (st.lnum, st.cnum) + +let parse_exn p st = + let res = ref None in + p st ~ok:(fun x -> res := Some x) ~err:(fun e -> raise e); + match !res with + | None -> assert false + | Some x -> x + +let exn_to_err e =Result.Error (Printexc.to_string e) + +let parse p st = + try Result.Ok (parse_exn p st) + with e -> exn_to_err e + +let parse_string_exn p s = parse_exn p (state_of_string s) + +let parse_string p s = parse p (state_of_string s) + +let read_all_ ic = + let buf = Buffer.create 1024 in + begin + try + while true do + let line = input_line ic in + Buffer.add_string buf line; + Buffer.add_char buf '\n'; + done; + assert false + with End_of_file -> () + end; + Buffer.contents buf + +let parse_file_exn p file = + let ic = open_in file in + let st = state_of_string (read_all_ ic) in + try + let res = parse_exn p st in + close_in ic; + res + with e -> + close_in_noerr ic; + raise e + +let parse_file p file = + try Result.Ok (parse_file_exn p file) + with e -> exn_to_err e + +module Infix = struct + let (>|=) = (>|=) + let (>>=) = (>>=) + let (<*>) = (<*>) + let (<* ) = (<* ) + let ( *>) = ( *>) + let (<|>) = (<|>) + let () = () +end + +module U = struct + let sep_ = sep + + let list ?(start="[") ?(stop="]") ?(sep=";") p = + string start *> skip_white *> + sep_ ~by:(skip_white *> string sep *> skip_white) p <* + skip_white <* string stop + + let int = + chars1_if (fun c -> is_num c || c='-') + >>= fun s -> + try return (int_of_string s) + with Failure _ -> fail "expected an int" + + let prepend_str c s = String.make 1 c ^ s + + let word = + map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) + + let pair ?(start="(") ?(stop=")") ?(sep=",") p1 p2 = + string start *> skip_white *> + p1 >>= fun x1 -> + skip_white *> string sep *> skip_white *> + p2 >>= fun x2 -> + string stop *> return (x1,x2) + + let triple ?(start="(") ?(stop=")") ?(sep=",") p1 p2 p3 = + string start *> skip_white *> + p1 >>= fun x1 -> + skip_white *> string sep *> skip_white *> + p2 >>= fun x2 -> + skip_white *> string sep *> skip_white *> + p3 >>= fun x3 -> + string stop *> return (x1,x2,x3) +end diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli new file mode 100644 index 00000000..5713e15c --- /dev/null +++ b/src/core/CCParse.mli @@ -0,0 +1,367 @@ + +(* This file is free software. See file "license" for more details. *) + +(** {1 Very Simple Parser Combinators} + +{[ +open CCParse;; + +type tree = L of int | N of tree * tree;; + +let mk_leaf x = L x +let mk_node x y = N(x,y) + +let ptree = fix @@ fun self -> + skip_space *> + ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + <|> + (U.int >|= mk_leaf) ) +;; + +parse_string_exn ptree "(1 (2 3))" ;; +parse_string_exn ptree "((1 2) (3 (4 5)))" ;; + +]} + +{6 Parse a list of words} + +{[ +open Containers_string.Parse;; +let p = U.list ~sep:"," U.word;; +parse_string_exn p "[abc , de, hello ,world ]";; +]} + +{6 Stress Test} +This makes a list of 100_000 integers, prints it and parses it back. + +{[ +let p = CCParse.(U.list ~sep:"," U.int);; + +let l = CCList.(1 -- 100_000);; +let l_printed = + CCFormat.to_string (CCList.print ~sep:"," ~start:"[" ~stop:"]" CCInt.print) l;; + +let l' = CCParse.parse_string_exn p l_printed;; + +assert (l=l');; +]} + +*) + +(*$inject + module T = struct + type tree = L of int | N of tree * tree + end + open T + open Result + + let mk_leaf x = L x + let mk_node x y = N(x,y) + + let ptree = fix @@ fun self -> + skip_space *> + ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + <|> + (U.int >|= mk_leaf) ) + + let ptree' = fix_memo @@ fun self -> + skip_space *> + ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + <|> + (U.int >|= mk_leaf) ) + + let rec pptree = function + | N (a,b) -> Printf.sprintf "N (%s, %s)" (pptree a) (pptree b) + | L x -> Printf.sprintf "L %d" x + + let errpptree = function + | Ok x -> "Ok " ^ pptree x + | Error s -> "Error " ^ s +*) + +(*$= & ~printer:errpptree + (Ok (N (L 1, N (L 2, L 3)))) \ + (parse_string ptree "(1 (2 3))" ) + (Ok (N (N (L 1, L 2), N (L 3, N (L 4, L 5))))) \ + (parse_string ptree "((1 2) (3 (4 5)))" ) + (Ok (N (L 1, N (L 2, L 3)))) \ + (parse_string ptree' "(1 (2 3))" ) + (Ok (N (N (L 1, L 2), N (L 3, N (L 4, L 5))))) \ + (parse_string ptree' "((1 2) (3 (4 5)))" ) +*) + +(*$R + let p = U.list ~sep:"," U.word in + let printer = function + | Ok l -> "Ok " ^ CCFormat.(to_string (list string)) l + | Error s -> "Error " ^ s + in + assert_equal ~printer + (Ok ["abc"; "de"; "hello"; "world"]) + (parse_string p "[abc , de, hello ,world ]"); + *) + +(*$R + let test n = + let p = CCParse.(U.list ~sep:"," U.int) in + + let l = CCList.(1 -- n) in + let l_printed = + CCFormat.(to_string (list ~start:"[" ~stop:"]" ~sep:"," int)) l in + + let l' = CCParse.parse_string_exn p l_printed in + + assert_equal ~printer:Q.Print.(list int) l l' + in + test 100_000; + test 400_000; + +*) + +type 'a or_error = ('a, string) Result.result + +type line_num = int +type col_num = int + +type parse_branch + +val string_of_branch : parse_branch -> string + +exception ParseError of parse_branch * (unit -> string) +(** parsing branch * message *) + +(** {2 Input} *) + +type position + +type state + +val state_of_string : string -> state + +(** {2 Combinators} *) + +type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit +(** Takes the input and two continuations: + {ul + {- [ok] to call with the result when it's done} + {- [err] to call when the parser met an error} + } + @raise ParseError in case of failure *) + +val return : 'a -> 'a t +(** Always succeeds, without consuming its input *) + +val pure : 'a -> 'a t +(** Synonym to {!return} *) + +val (>|=) : 'a t -> ('a -> 'b) -> 'b t +(** Map *) + +val map : ('a -> 'b) -> 'a t -> 'b t + +val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t + +val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t + +val (>>=) : 'a t -> ('a -> 'b t) -> 'b t +(** Monadic bind *) + +val (<*>) : ('a -> 'b) t -> 'a t -> 'b t +(** Applicative *) + +val (<* ) : 'a t -> _ t -> 'a t +(** [a <* b] parses [a] into [x], parses [b] and ignores its result, + and returns [x] *) + +val ( *>) : _ t -> 'a t -> 'a t +(** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The + results of [a] is ignored. *) + +val fail : string -> 'a t +(** [fail msg] fails with the given message. It can trigger a backtrack *) + +val failf: ('a, unit, string, 'b t) format4 -> 'a +(** [Format.sprintf] version of {!fail} *) + +val parsing : string -> 'a t -> 'a t +(** [parsing s p] behaves the same as [p], with the information that + we are parsing [s], if [p] fails *) + +val eoi : unit t +(** Expect the end of input, fails otherwise *) + +val nop : unit t +(** Succeed with [()] *) + +val char : char -> char t +(** [char c] parses the char [c] and nothing else *) + +val char_if : (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true] *) + +val chars_if : (char -> bool) -> string t +(** [chars_if f] parses a string of chars that satisfy [f] *) + +val chars1_if : (char -> bool) -> string t +(** Same as {!chars_if}, but only non-empty strings *) + +val endline : char t +(** Parses '\n' *) + +val space : char t +(** Tab or space *) + +val white : char t +(** Tab or space or newline *) + +val skip_chars : (char -> bool) -> unit t +(** Skip 0 or more chars satisfying the predicate *) + +val skip_space : unit t +(** Skip ' ' and '\t' *) + +val skip_white : unit t +(** Skip ' ' and '\t' and '\n' *) + +val is_alpha : char -> bool +(** Is the char a letter? *) + +val is_num : char -> bool +(** Is the char a digit? *) + +val is_alpha_num : char -> bool + +val is_space : char -> bool +(** True on ' ' and '\t' *) + +val is_white : char -> bool +(** True on ' ' and '\t' and '\n' *) + +val (<|>) : 'a t -> 'a t -> 'a t +(** [a <|> b] tries to parse [a], and if [a] fails without + consuming any input, backtracks and tries + to parse [b], otherwise it fails as [a]. + See {!try_} to ensure [a] does not consume anything (but it is best + to avoid wrapping large parsers with {!try_}) *) + +val () : 'a t -> string -> 'a t +(** [a msg] behaves like [a], but if [a] fails without + consuming any input, it fails with [msg] + instead. Useful as the last choice in a series of [<|>]: + [a <|> b <|> c "expected a|b|c"] *) + +val try_ : 'a t -> 'a t +(** [try_ p] tries to parse like [p], but backtracks if [p] fails. + Useful in combination with [<|>] *) + +val suspend : (unit -> 'a t) -> 'a t +(** [suspend f] is the same as [f ()], but evaluates [f ()] only + when needed *) + +val string : string -> string t +(** [string s] parses exactly the string [s], and nothing else *) + +val many : 'a t -> 'a list t +(** [many p] parses a list of [p], eagerly (as long as possible) *) + +val many1 : 'a t -> 'a list t +(** parses a non empty list *) + +val skip : _ t -> unit t +(** [skip p] parses zero or more times [p] and ignores its result *) + +val sep : by:_ t -> 'a t -> 'a list t +(** [sep ~by p] parses a list of [p] separated by [by] *) + +val sep1 : by:_ t -> 'a t -> 'a list t +(** [sep1 ~by p] parses a non empty list of [p], separated by [by] *) + + +val fix : ('a t -> 'a t) -> 'a t +(** Fixpoint combinator *) + +val memo : 'a t -> 'a t +(** Memoize the parser. [memo p] will behave like [p], but when called + in a state (read: position in input) it has already processed, [memo p] + returns a result directly. The implementation uses an underlying + hashtable. + This can be costly in memory, but improve the run time a lot if there + is a lot of backtracking involving [p]. + + This function is not thread-safe. *) + +val fix_memo : ('a t -> 'a t) -> 'a t +(** Same as {!fix}, but the fixpoint is memoized. *) + +val get_lnum : int t +(** Reflects the current line number *) + +val get_cnum : int t +(** Reflects the current column number *) + +val get_pos : (int * int) t +(** Reflects the current (line, column) numbers *) + +(** {2 Parse} + + Those functions have a label [~p] on the parser, since 0.14. +*) + +val parse : 'a t -> state -> 'a or_error +(** [parse p st] applies [p] on the input, and returns [Ok x] if + [p] succeeds with [x], or [Error s] otherwise *) + +val parse_exn : 'a t -> state -> 'a +(** Unsafe version of {!parse} + @raise ParseError if it fails *) + +val parse_string : 'a t -> string -> 'a or_error +(** Specialization of {!parse} for string inputs *) + +val parse_string_exn : 'a t -> string -> 'a +(** @raise ParseError if it fails *) + +val parse_file : 'a t -> string -> 'a or_error +(** [parse_file p file] parses [file] with [p] by opening the file + and reading it whole. *) + +val parse_file_exn : 'a t -> string -> 'a +(** @raise ParseError if it fails *) + +(** {2 Infix} *) + +module Infix : sig + val (>|=) : 'a t -> ('a -> 'b) -> 'b t + val (>>=) : 'a t -> ('a -> 'b t) -> 'b t + val (<*>) : ('a -> 'b) t -> 'a t -> 'b t + val (<* ) : 'a t -> _ t -> 'a t + val ( *>) : _ t -> 'a t -> 'a t + val (<|>) : 'a t -> 'a t -> 'a t + val () : 'a t -> string -> 'a t +end + +(** {2 Utils} + + This is useful to parse OCaml-like values in a simple way. *) + +module U : sig + val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t + (** [list p] parses a list of [p], with the OCaml conventions for + start token "[", stop token "]" and separator ";". + Whitespace between items are skipped *) + + val int : int t + + val word : string t + (** non empty string of alpha num, start with alpha *) + + val pair : ?start:string -> ?stop:string -> ?sep:string -> + 'a t -> 'b t -> ('a * 'b) t + (** Parse a pair using OCaml whitespace conventions. + The default is "(a, b)". *) + + val triple : ?start:string -> ?stop:string -> ?sep:string -> + 'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t + (** Parse a triple using OCaml whitespace conventions. + The default is "(a, b, c)". *) +end