From 7318162c5552d5bdd3cdb2f29c3eaa1d5a5cef3f Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Tue, 4 May 2021 17:48:49 -0400 Subject: [PATCH 01/24] wip: rework CCParse --- src/core/CCParse.ml | 60 +++++++++++++++++++++++++++++++++++++++++--- src/core/CCParse.mli | 27 ++++++++++++-------- 2 files changed, 73 insertions(+), 14 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 027dd46a..f8a15cc8 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -30,9 +30,16 @@ open CCShims_ | N (a,b) -> Printf.sprintf "N (%s, %s)" (pptree a) (pptree b) | L x -> Printf.sprintf "L %d" x - let errpptree = function - | Ok x -> "Ok " ^ pptree x + let errpp pp = function + | Ok x -> "Ok " ^ pp x | Error s -> "Error " ^ s + + let errpptree = errpp pptree + + let erreq eq x y = match x, y with + | Ok x, Ok y -> eq x y + | Error _ , Error _ -> true + | _ -> false *) (*$= & ~printer:errpptree @@ -125,6 +132,8 @@ module MemoTbl = struct | Ok of 'a end +(* TODO: [type position = {state: state; i: int}] and recompute line, col + on demand *) type position = int * int * int (* pos, line, column *) type parse_branch = (line_num * col_num * string option) list @@ -137,6 +146,7 @@ type state = { mutable branch: parse_branch; memo : MemoTbl.t; (* Memoization table, if any *) } +(* TODO: remove lnum/cnum, recompute them lazily in errors *) exception ParseError of parse_branch * (unit -> string) @@ -202,7 +212,17 @@ let backtrack st (i',l',c') = st.cnum <- c'; () +(* FIXME: + remove all backtracking stuff and instead, pass the state as parameter + to [ok] and [err], with an explicit offset that changes. *) + type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit +(** Takes the input and two continuations: + {ul + {- [ok] to call with the result when it's done} + {- [err] to call when the parser met an error} + } +*) let return : 'a -> 'a t = fun x _st ~ok ~err:_ -> ok x let pure = return @@ -500,23 +520,55 @@ module U = struct sep_ ~by:(skip_white *> string sep *> skip_white) p <* skip_white <* string stop + let int = + skip_white *> chars1_if (fun c -> is_num c || char_equal c '-') >>= fun s -> try return (int_of_string s) with Failure _ -> fail "expected an int" + let hexa_int = + (try_ (string "0x") <|> return "") *> + begin + chars1_if (function '0' .. '9' | 'a'..'f' | 'A'..'F' -> true | _ -> false) + >|= fun s -> + let i = ref 0 in + String.iter + (fun c -> + let n = match c with + | '0' .. '9' -> Char.code c - Char.code '0' + | 'a' .. 'f' -> Char.code c - Char.code 'a' + 10 + | 'A' .. 'F' -> Char.code c - Char.code 'A' + 10 + | _ -> assert false + in + i := !i * 16 + n) + s; + !i + end + + (*$= & ~printer:(errpp Q.Print.int) ~cmp:(erreq (=)) + (Ok 16) (parse_string U.hexa_int "0x10") + (Ok 16) (parse_string U.hexa_int "10") + (Error "") (parse_string U.hexa_int "x10") + (Error "") (parse_string U.hexa_int "0xz") + *) + let prepend_str c s = String.make 1 c ^ s let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) let pair ?(start="(") ?(stop=")") ?(sep=",") p1 p2 = - string start *> skip_white *> + skip_white *> string start *> skip_white *> p1 >>= fun x1 -> skip_white *> string sep *> skip_white *> p2 >>= fun x2 -> - string stop *> return (x1,x2) + skip_white *> string stop *> return (x1,x2) + + (*$= & ~printer:Q.Print.(errpp (pair int int)) + (Ok(1,2)) U.(parse_string (pair int int) "(1 , 2 )") + *) let triple ?(start="(") ?(stop=")") ?(sep=",") p1 p2 p3 = string start *> skip_white *> diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index ecf90fa5..599c066c 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -3,6 +3,9 @@ (** {1 Very Simple Parser Combinators} + These combinators can be used to write very simple parsers, for example + to extract data from a line-oriented file. + {[ open CCParse;; @@ -70,13 +73,12 @@ val state_of_string : string -> state (** {2 Combinators} *) -type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit -(** Takes the input and two continuations: - {ul - {- [ok] to call with the result when it's done} - {- [err] to call when the parser met an error} - } - @raise ParseError in case of failure. *) +type 'a t +(** The abstract type of parsers that return a value of type ['a] (or fail). + + @raise ParseError in case of failure. + @since NEXT_RELEASE the type is private. +*) val return : 'a -> 'a t (** Always succeeds, without consuming its input. *) @@ -299,8 +301,8 @@ end (** {2 Utils} - This is useful to parse OCaml-like values in a simple way. *) - + This is useful to parse OCaml-like values in a simple way. + All the parsers are whitespace-insensitive (they skip whitespace). *) module U : sig val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t (** [list p] parses a list of [p], with the OCaml conventions for @@ -308,7 +310,12 @@ module U : sig Whitespace between items are skipped. *) val int : int t - (** Parse an int. *) + (** Parse an int in decimal representation. *) + + val hexa_int : int t + (** Parse an int int hexadecimal format. Accepts an optional [0x] prefix, + and ignores capitalization. + @since NEXT_RELEASE *) val word : string t (** Non empty string of alpha num, start with alpha. *) From 0ec40c2331a2d208366ab3f8ebee83701088cb96 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Tue, 4 May 2021 23:11:41 -0400 Subject: [PATCH 02/24] CCParse: heavy refactoring, many new functions --- src/core/CCParse.ml | 778 +++++++++++++++++++++++++++---------------- src/core/CCParse.mli | 195 +++++++---- 2 files changed, 627 insertions(+), 346 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index f8a15cc8..a069052b 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -16,13 +16,13 @@ open CCShims_ let ptree = fix @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) let ptree' = fix_memo @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) @@ -39,7 +39,7 @@ open CCShims_ let erreq eq x y = match x, y with | Ok x, Ok y -> eq x y | Error _ , Error _ -> true - | _ -> false + | _ -> false ;; *) (*$= & ~printer:errpptree @@ -56,7 +56,7 @@ open CCShims_ (*$R let p = U.list ~sep:"," U.word in let printer = function - | Ok l -> "Ok " ^ CCFormat.(to_string (list string)) l + | Ok l -> "Ok " ^ CCFormat.(to_string (Dump.list string_quoted)) l | Error s -> "Error " ^ s in assert_equal ~printer @@ -84,7 +84,7 @@ open CCShims_ let open CCParse.Infix in let module P = CCParse in - let parens p = P.try_ (P.char '(') *> p <* P.char ')' in + let parens p = P.char '(' *> p <* P.char ')' in let add = P.char '+' *> P.return (+) in let sub = P.char '-' *> P.return (-) in let mul = P.char '*' *> P.return ( * ) in @@ -94,7 +94,7 @@ open CCShims_ let chainl1 e op = P.fix (fun r -> - e >>= fun x -> P.try_ (op <*> P.return x <*> r) <|> P.return x) in + e >>= fun x -> (op <*> P.return x <*> r) <|> P.return x) in let expr : int P.t = P.fix (fun expr -> @@ -107,193 +107,270 @@ open CCShims_ () *) -type 'a or_error = ('a, string) result +module Error = struct + type t = { + msg: unit -> string; + str: string; + offset: int; (* offset in [e_str] *) + } -type line_num = int -type col_num = int + let get_loc_ (self:t) : int * int = + let i = ref 0 in + let continue = ref true in + let line = ref 1 in + let col = ref 1 in + while !continue && !i < self.offset do + match String.index_from self.str !i '\n' with + | exception Not_found -> + col := self.offset - !i; continue := false; + | j when j > self.offset -> + col := self.offset - !i; continue := false; + | j -> incr line; i := j+1; + done; + !line, !col -module MemoTbl = struct - module H = Hashtbl.Make(struct - type t = int * int (* id of parser, position *) - let equal ((a,b):t)(c,d) = a=c && b=d - let hash = Hashtbl.hash - end) + let line_and_column self = get_loc_ self + let msg self = self.msg() + let to_string self = + let line,col = get_loc_ self in + Printf.sprintf "at line %d, char %d:\n%s" line col (self.msg()) + + let pp out self = + let line,col = get_loc_ self in + Format.fprintf out "at line %d, char %d:@ %s" line col (self.msg()) +end + +type 'a or_error = ('a, Error.t) result + +module Memo_tbl = Hashtbl.Make(struct + type t = int * int (* id of parser, position *) + let equal ((a,b):t)(c,d) = a=c && b=d + let hash = Hashtbl.hash + end) + +module Memo_state = struct (* table of closures, used to implement universal type *) - type t = (unit -> unit) H.t lazy_t - - let create n = lazy (H.create n) + type t = (unit -> unit) Memo_tbl.t (* unique ID for each parser *) let id_ = ref 0 - - type 'a res = - | Fail of exn - | Ok of 'a end (* TODO: [type position = {state: state; i: int}] and recompute line, col on demand *) type position = int * int * int (* pos, line, column *) -type parse_branch = (line_num * col_num * string option) list - +(** Purely functional state passed around *) type state = { str: string; (* the input *) - mutable i: int; (* offset *) - mutable lnum : line_num; (* Line number *) - mutable cnum : col_num; (* Column number *) - mutable branch: parse_branch; - memo : MemoTbl.t; (* Memoization table, if any *) + i: int; (* offset in [input.str] *) + memo : Memo_state.t option ref; (* Memoization table, if any *) } -(* TODO: remove lnum/cnum, recompute them lazily in errors *) -exception ParseError of parse_branch * (unit -> string) - -let char_equal (a : char) b = Stdlib.(=) a b -let string_equal (a : string) b = Stdlib.(=) a b - -let rec string_of_branch l = - let pp_s () = function - | None -> "" - | Some s -> Format.sprintf "while parsing %s, " s - in - match l with - | [] -> "" - | [l,c,s] -> - Format.sprintf "@[%aat line %d, col %d@]" pp_s s l c - | (l,c,s) :: tail -> - Format.sprintf "@[%aat line %d, col %d@]@,%s" pp_s s l c (string_of_branch tail) +let[@inline] char_equal (a : char) b = Stdlib.(=) a b +let string_equal = String.equal +(* FIXME: printer for error let () = Printexc.register_printer (function | ParseError (b,msg) -> Some (Format.sprintf "@[%s@ %s@]" (msg()) (string_of_branch b)) | _ -> None) + *) -let const_ x () = x +let[@inline] const_str_ x () : string = x let state_of_string str = let s = { str; i=0; - lnum=1; - cnum=1; - branch=[]; - memo=MemoTbl.create 32; + memo=ref None; } in s -let is_done st = st.i = String.length st.str -let cur st = st.str.[st.i] +let[@inline] is_done st = st.i >= String.length st.str +let[@inline] cur st = st.str.[st.i] -let fail_ ~err st msg = - let b = (st.lnum, st.cnum, None) :: st.branch in - err (ParseError (b, msg)) +let mk_error_ st msg : Error.t = + {Error.msg; str=st.str; offset=st.i} -let next st ~ok ~err = - if st.i = String.length st.str - then fail_ ~err st (const_ "unexpected end of input") - else ( +(* consume one char, passing it to [ok]. *) +let consume_ st ~ok ~err = + if is_done st then ( + let msg = const_str_ "unexpected end of input" in + err (mk_error_ st msg) + ) else ( let c = st.str.[st.i] in - st.i <- st.i + 1; - if char_equal c '\n' - then (st.lnum <- st.lnum + 1; st.cnum <- 1) - else st.cnum <- st.cnum + 1; - ok c + ok {st with i=st.i + 1} c ) -let pos st = st.i, st.lnum, st.cnum - -let backtrack st (i',l',c') = - assert (0 <= i' && i' <= st.i); - st.i <- i'; - st.lnum <- l'; - st.cnum <- c'; - () - -(* FIXME: - remove all backtracking stuff and instead, pass the state as parameter - to [ok] and [err], with an explicit offset that changes. *) - -type 'a t = state -> ok:('a -> unit) -> err:(exn -> unit) -> unit +type 'a t = { + run: 'b. state -> ok:(state -> 'a -> 'b) -> err:(Error.t -> 'b) -> 'b; +} [@@unboxed] (** Takes the input and two continuations: {ul - {- [ok] to call with the result when it's done} + {- [ok] to call with the result and new state when it's done} {- [err] to call when the parser met an error} } *) -let return : 'a -> 'a t = fun x _st ~ok ~err:_ -> ok x +let return x : _ t = { + run=fun st ~ok ~err:_ -> ok st x +} + let pure = return -let (>|=) : 'a t -> ('a -> 'b) -> 'b t - = fun p f st ~ok ~err -> p st ~ok:(fun x -> ok (f x)) ~err -let (>>=) : 'a t -> ('a -> 'b t) -> 'b t - = fun p f st ~ok ~err -> p st ~ok:(fun x -> f x st ~ok ~err) ~err -let (<*>) : ('a -> 'b) t -> 'a t -> 'b t - = fun f x st ~ok ~err -> - f st ~ok:(fun f' -> x st ~ok:(fun x' -> ok (f' x')) ~err) ~err -let (<* ) : 'a t -> _ t -> 'a t - = fun x y st ~ok ~err -> - x st ~ok:(fun res -> y st ~ok:(fun _ -> ok res) ~err) ~err -let ( *>) : _ t -> 'a t -> 'a t - = fun x y st ~ok ~err -> - x st ~ok:(fun _ -> y st ~ok ~err) ~err + +let (>|=) (p: 'a t) f : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> ok st (f x)) + ~err +} + +let (>>=) (p:'a t) f : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + let p2 = f x in + p2.run st ~ok ~err) + ~err +} + +let (<*>) (f:_ t) (a:_ t) : _ t = { + run=fun st ~ok ~err -> + f.run st + ~ok:(fun st f -> + a.run st ~ok:(fun st x -> ok st (f x)) ~err) + ~err +} + +let (<*) (a:_ t) (b:_ t) : _ t = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st x -> + b.run st ~ok:(fun st _ -> ok st x) ~err) + ~err +} + +let ( *> ) (a:_ t) (b:_ t) : _ t = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st _ -> + b.run st ~ok:(fun st x -> ok st x) ~err) + ~err +} let map f x = x >|= f let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z -let junk_ st = next st ~ok:ignore ~err:(fun _ -> assert false) +let junk_ (st:state) : state = + assert (st.i < String.length st.str); + {st with i=st.i + 1} -let eoi st ~ok ~err = +let eoi = { + run=fun st ~ok ~err -> if is_done st - then ok () - else fail_ ~err st (const_ "expected EOI") + then ok st () + else err (mk_error_ st (const_str_ "expected end of input")) +} -let fail msg st ~ok:_ ~err = fail_ ~err st (const_ msg) +let fail msg : _ t = { + run=fun st ~ok:_ ~err -> + err (mk_error_ st (const_str_ msg)) +} let failf msg = Printf.ksprintf fail msg -let parsing s p st ~ok ~err = - st.branch <- (st.lnum, st.cnum, Some s) :: st.branch; - p st - ~ok:(fun x -> st.branch <- List.tl st.branch; ok x) - ~err:(fun e -> st.branch <- List.tl st.branch; err e) +let parsing what p = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun e -> + let msg() = + Printf.sprintf "while parsing %s:\n%s" what (e.Error.msg ()) + in + err {e with Error.msg}) +} -let nop _ ~ok ~err:_ = ok() +let nop = { + run=fun st ~ok ~err:_ -> ok st (); +} -let char c = - let msg = Printf.sprintf "expected '%c'" c in - fun st ~ok ~err -> - next st - ~ok:(fun c' -> if char_equal c c' then ok c else fail_ ~err st (const_ msg)) ~err +let any_char = { + run=fun st ~ok ~err -> consume_ st ~ok ~err +} -let char_if p st ~ok ~err = - next st - ~ok:(fun c -> - if p c then ok c - else fail_ ~err st (fun () -> Printf.sprintf "unexpected char '%c'" c) - ) ~err +let char c : _ t = { + run=fun st ~ok ~err -> + consume_ st + ~ok:(fun st c2 -> + if char_equal c c2 then ok st c + else ( + let msg() = Printf.sprintf "expected '%c', got '%c'" c (cur st) in + err (mk_error_ st msg) + )) + ~err +} -let chars_if p st ~ok ~err:_ = - let i = st.i in - let len = ref 0 in - while not (is_done st) && p (cur st) do junk_ st; incr len done; - ok (String.sub st.str i !len) +let char_if ?descr p = { + run=fun st ~ok ~err -> + consume_ st + ~ok:(fun st c -> + if p c then ok st c + else ( + let msg() = + let rest = match descr with + | None -> "" + | Some d -> Printf.sprintf ", expected %s" d + in + Printf.sprintf "unexpected char '%c'%s" c rest + in + err (mk_error_ st msg) + )) + ~err +} -let chars1_if p st ~ok ~err = - chars_if p st - ~ok:(fun s -> - if string_equal s "" - then fail_ ~err st (const_ "unexpected sequence of chars") - else ok s) - ~err +let chars_if p = { + run=fun st ~ok ~err:_ -> + let i0 = st.i in + let i = ref i0 in + while + let st = {st with i = !i} in + not (is_done st) && p (cur st) + do + incr i; + done; + ok {st with i = !i} (String.sub st.str i0 (!i - i0)) +} -let rec skip_chars p st ~ok ~err = - if not (is_done st) && p (cur st) then ( - junk_ st; - skip_chars p st ~ok ~err - ) else ok() +let chars1_if ?descr p = { + run=fun st ~ok ~err -> + (chars_if p).run st + ~ok:(fun st s -> + if string_equal s "" + then ( + let msg() = + let what = match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + err (mk_error_ st msg) + ) else ok st s) + ~err +} + +let skip_chars p : _ t = + let rec self = { + run=fun st ~ok ~err -> + if not (is_done st) && p (cur st) then ( + let st = junk_ st in + self.run st ~ok ~err + ) else ok st () + } + in + self let is_alpha = function | 'a' .. 'z' | 'A' .. 'Z' -> true @@ -308,77 +385,136 @@ let is_white = function ' ' | '\t' | '\n' -> true | _ -> false let space = char_if is_space let white = char_if is_white -let endline st ~ok ~err = - next st - ~ok:(function - | '\n' as c -> ok c - | _ -> fail_ ~err st (const_ "expected end-of-line")) - ~err - +let endline = + char_if ~descr:"end-of-line ('\\n')" (function '\n' -> true | _ -> false) let skip_space = skip_chars is_space let skip_white = skip_chars is_white -let (<|>) : 'a t -> 'a t -> 'a t - = fun x y st ~ok ~err -> - let i = st.i in - x st ~ok - ~err:(fun e -> - let j = st.i in - if i=j then y st ~ok ~err (* try [y] *) - else err e (* fail *) +let or_ (p1:'a t) (p2:'a t) : _ t = { + run=fun st ~ok ~err -> + p1.run st ~ok + ~err:(fun _e -> p2.run st ~ok ~err) +} +let (<|>) = or_ + +let (|||) a b = map2 (fun x y ->x,y) a b + +let try_or p1 ~f ~else_:p2 = { + run=fun st ~ok ~err -> + p1.run st + ~ok:(fun st x -> (f x).run st ~ok ~err) + ~err:(fun _ -> p2.run st ~ok ~err) +} + +let suspend f = { + run=fun st ~ok ~err -> + let p = f () in + p.run st ~ok ~err +} + +let () (p:'a t) msg : _ t = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) +} + +(* read [len] chars at once *) +let any_chars len : _ t = { + run=fun st ~ok ~err -> + if st.i + len <= String.length st.str then ( + let s = String.sub st.str st.i len in + let st = {st with i = st.i + len} in + ok st s + ) else ( + let msg() = + Printf.sprintf "expected to be able to consume %d chars" len + in + err (mk_error_ st msg) + ) +} + +let exact s = { + run=fun st ~ok ~err -> + (* parse a string of length [String.length s] and compare with [s] *) + (any_chars (String.length s)).run st + ~ok:(fun st s2 -> + if string_equal s s2 then ok st s + else ( + let msg() = Printf.sprintf "expected %S, got %S" s s2 in + err (mk_error_ st msg) + ) + ) + ~err +} + +let string = exact + +let fix f = + let rec self = { + run=fun st ~ok ~err -> + (Lazy.force f_self).run st ~ok ~err + } + and f_self = lazy (f self) in + self + +let try_ p : _ t = { + run=fun st ~ok ~err:_ -> + p.run st + ~ok:(fun st x -> ok st (Some x)) + ~err:(fun _ -> ok st None) +} + +let optional p : _ t = { + run=fun st ~ok ~err:_ -> + p.run st + ~ok:(fun st _x -> ok st ()) + ~err:(fun _ -> ok st ()) +} + +let many_until ~until p : _ t = + fix + (fun self -> + try_or until ~f:(fun _ -> pure []) + ~else_:( + p >>= fun x -> + self >|= fun l -> x :: l + )) + +let many p : _ t = + fix + (fun self -> + try_or p + ~f:(fun x -> self >|= fun tl -> x :: tl) + (pure [])) + +(* +(* parse many [p], as a difference list *) +let many_rec_ p : (_ list -> _ list) t = + let rec self = { + run=fun st ~ok ~err -> + if is_done st then ok st (fun l->l) (* empty list *) + else ( + p.run st + ~ok:(fun st x -> + self.run st + ~ok:(fun st f -> ok st (fun l -> x :: f l)) + ~err) + ~err ) + } in + self -let try_ : 'a t -> 'a t - = fun p st ~ok ~err -> - let i = pos st in - p st ~ok - ~err:(fun e -> - backtrack st i; - err e) - -let suspend f st ~ok ~err = f () st ~ok ~err - -let () : 'a t -> string -> 'a t - = fun x msg st ~ok ~err -> - let i = st.i in - x st ~ok - ~err:(fun e -> - if st.i = i - then fail_ ~err st (fun () -> msg) - else err e) - -let string s st ~ok ~err = - let rec check i = - if i = String.length s then ok s - else - next st - ~ok:(fun c -> - if char_equal c s.[i] - then check (i+1) - else fail_ ~err st (fun () -> Printf.sprintf "expected \"%s\"" s)) - ~err - in - check 0 - -let rec many_rec : 'a t -> 'a list -> 'a list t = fun p acc st ~ok ~err -> - if is_done st then ok(List.rev acc) - else - p st - ~ok:(fun x -> - let i = pos st in - let acc = x :: acc in - many_rec p acc st ~ok - ~err:(fun _ -> - backtrack st i; - ok(List.rev acc)) - ) ~err - -let many : 'a t -> 'a list t - = fun p st ~ok ~err -> many_rec p [] st ~ok ~err +let many p : _ t = { + run=fun st ~ok ~err -> + (many_rec_ p).run st + ~ok:(fun st f -> ok st (f [])) + ~err +} + *) (*$R let p0 = skip_white *> U.int in - let p = (skip_white *> char '(' *> many p0) <* try_ (skip_white <* char ')') in + let p = (skip_white *> char '(' *> many p0) <* (skip_white <* char ')') in let printer = CCFormat.(to_string @@ Dump.result @@ Dump.list int) in assert_equal ~printer (Ok [1;2;3]) (parse_string p "(1 2 3)"); @@ -387,91 +523,165 @@ let many : 'a t -> 'a list t *) -let many1 : 'a t -> 'a list t = - fun p st ~ok ~err -> - p st ~ok:(fun x -> many_rec p [x] st ~ok ~err) ~err - -let rec skip p st ~ok ~err = - let i = pos st in - p st - ~ok:(fun _ -> skip p st ~ok ~err) - ~err:(fun _ -> - backtrack st i; - ok() - ) - -(* by (sep1 ~by p) *) -let rec sep_rec ~by p = try_ by *> sep1 ~by p - -and sep1 ~by p = +let many1 p = p >>= fun x -> - (sep_rec ~by p >|= fun tl -> x::tl) - <|> return [x] + many p >|= fun l -> x :: l + +(* skip can be made efficient by not allocating intermediate parsers *) +let skip p : _ t = + let rec self = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st _ -> self.run st ~ok ~err) + ~err:(fun _ -> + ok st ()) + } in + self + +let sep_until ~until ~by p = + let rec read_p = lazy ( + p >>= fun x -> + (until *> pure [x]) + <|> + (by *> (Lazy.force read_p >|= fun tl -> x :: tl)) + ) in + (until *> pure []) + <|> (Lazy.force read_p) let sep ~by p = - (try_ p >>= fun x -> - (sep_rec ~by p >|= fun tl -> x::tl) - <|> return [x]) - <|> return [] + let rec read_p = lazy ( + try_or p + ~f:(fun x -> + (eoi *> pure [x]) + <|> + try_or by + ~f:(fun _ -> Lazy.force read_p >|= fun tl -> x :: tl) + (pure [x])) + (pure []) + ) in + Lazy.force read_p -let fix f = - let rec p st ~ok ~err = f p st ~ok ~err in - p +(*$inject + let aword = chars1_if (function 'a'..'z'|'A'..'Z'->true|_ -> false);; +*) +(*$= & ~printer:(errpp Q.Print.(list string)) +(Ok ["a";"b";"c"]) \ + (parse_string (optional (char '/') *> sep ~by:(char '/') aword) "/a/b/c") +(Ok ["a";"b";"c"]) \ + (parse_string (optional (char '/') *> sep ~by:(char '/') aword) "a/b/c") +*) -let memo (type a) (p:a t):a t = - let id = !MemoTbl.id_ in - incr MemoTbl.id_; +let sep1 ~by p = + p >>= fun x -> + sep ~by p >|= fun tl -> + x :: tl + +let line : _ t = { + run=fun st ~ok ~err -> + if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) + else ( + match String.index_from st.str st.i '\n' with + | j -> + let s = String.sub st.str st.i (j - st.i) in + ok {st with i=j+1} s + | exception Not_found -> + err (mk_error_ st (const_str_ "unterminated line")) + ) +} + +(*$= + (Ok "1234") (parse_string line "1234\nyolo") + (Ok ("1234", "yolo")) (parse_string (line ||| line) "1234\nyolo\nswag") +*) + +(* parse a string [s] using [p_sub], then parse [s] using [p]. + The result is that of parsing [s] using [p], but the state is + the one after using [p_sub], and errors are translated back into the context + of [p_sub]. + This can be useful for example in [p_sub line some_line_parser]. *) +let parse_sub_ p_sub p : _ t = { + run=fun st0 ~ok ~err -> + let p = p <* eoi in (* make sure [p] reads all *) + p_sub.run st0 + ~ok:(fun st1 s -> + p.run (state_of_string s) + ~ok:(fun _ r -> ok st1 r) + ~err:(fun e -> + err {e with Error.str=st0.str; offset=e.Error.offset + st0.i})) + ~err +} + +let each_line p : _ t = + fix + (fun self -> + try_or eoi + ~f:(fun _ -> pure []) + (parse_sub_ line p >>= fun x -> + self >|= fun tl -> x :: tl)) + +(*$= & ~printer:(errpp Q.Print.(list @@ list int)) + (Ok ([[1;1];[2;2];[3;3]])) \ + (parse_string (each_line (sep ~by:skip_space U.int)) "1 1\n2 2\n3 3\n") +*) + +let memo (type a) (p:a t) : a t = + let id = !Memo_state.id_ in + incr Memo_state.id_; let r = ref None in (* used for universal encoding *) - fun st ~ok ~err -> - let i = st.i in - let (lazy tbl) = st.memo in - try - let f = MemoTbl.H.find tbl (i,id) in - (* extract hidden value *) + + {run=fun st ~ok ~err -> + let tbl = match !(st.memo) with + | Some t -> t + | None -> + let tbl = Memo_tbl.create 32 in + st.memo := Some tbl; + tbl + in + + match r := None; - f (); - begin match !r with - | None -> assert false - | Some (MemoTbl.Ok x) -> ok x - | Some (MemoTbl.Fail e) -> err e - end - with Not_found -> - (* parse, and save *) - p st - ~ok:(fun x -> - MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Ok x)); - ok x) - ~err:(fun e -> - MemoTbl.H.replace tbl (i,id) (fun () -> r := Some (MemoTbl.Fail e)); - err e) + let f = Memo_tbl.find tbl (st.i, id) in + f(); + !r + with + | None -> assert false + | Some (Ok (st,x)) -> ok st x + | Some (Error e) -> err e + | exception Not_found -> + (* parse, and save *) + p.run st + ~ok:(fun st' x -> + Memo_tbl.replace tbl (st.i,id) (fun () -> r := Some (Ok (st',x))); + ok st' x) + ~err:(fun e -> + Memo_tbl.replace tbl (st.i,id) (fun () -> r := Some (Error e)); + err e) + } let fix_memo f = - let rec p = - let p' = lazy (memo p) in - fun st ~ok ~err -> f (Lazy.force p') st ~ok ~err - in + let rec p = { + run=fun st ~ok ~err -> (Lazy.force p').run st ~ok ~err + } + and p' = lazy (memo (f p)) in p -let get_lnum = fun st ~ok ~err:_ -> ok st.lnum -let get_cnum = fun st ~ok ~err:_ -> ok st.cnum -let get_pos = fun st ~ok ~err:_ -> ok (st.lnum, st.cnum) +exception ParseError of Error.t -let parse_exn p st = - let res = ref None in - p st ~ok:(fun x -> res := Some x) ~err:(fun e -> raise e); - match !res with - | None -> assert false - | Some x -> x +let stringify_result = function + | Ok _ as x -> x + | Error e -> Error (Error.to_string e) -let exn_to_err e = Error (Printexc.to_string e) +let parse_string_exn p s = + p.run (state_of_string s) + ~ok:(fun _st x -> x) + ~err:(fun e -> raise (ParseError e)) -let parse p st = - try Ok (parse_exn p st) - with e -> exn_to_err e +let parse_string_e p s = + p.run (state_of_string s) + ~ok:(fun _st x -> Ok x) + ~err:(fun e -> Error e) -let parse_string_exn p s = parse_exn p (state_of_string s) - -let parse_string p s = parse p (state_of_string s) +let parse_string p s = parse_string_e p s |> stringify_result let read_all_ ic = let buf = Buffer.create 1024 in @@ -487,20 +697,19 @@ let read_all_ ic = end; Buffer.contents buf -let parse_file_exn p file = +let parse_file_e p file = let ic = open_in file in - let st = state_of_string (read_all_ ic) in - try - let res = parse_exn p st in - close_in ic; - res - with e -> - close_in_noerr ic; - raise e + let s = read_all_ ic in + let r = parse_string_e p s in + close_in ic; + r -let parse_file p file = - try Ok (parse_file_exn p file) - with e -> exn_to_err e +let parse_file p file = parse_file_e p file |> stringify_result + +let parse_file_exn p file = + match parse_file_e p file with + | Ok x -> x + | Error e -> raise (ParseError e) module Infix = struct let (>|=) = (>|=) @@ -509,6 +718,7 @@ module Infix = struct let (<* ) = (<* ) let ( *>) = ( *>) let (<|>) = (<|>) + let (|||) = (|||) let () = () end @@ -517,19 +727,19 @@ module U = struct let list ?(start="[") ?(stop="]") ?(sep=";") p = string start *> skip_white *> - sep_ ~by:(skip_white *> string sep *> skip_white) p <* - skip_white <* string stop - + sep_until + ~until:(skip_white <* string stop) + ~by:(skip_white *> string sep *> skip_white) p let int = skip_white *> - chars1_if (fun c -> is_num c || char_equal c '-') + chars1_if ~descr:"integer" (fun c -> is_num c || char_equal c '-') >>= fun s -> try return (int_of_string s) with Failure _ -> fail "expected an int" let hexa_int = - (try_ (string "0x") <|> return "") *> + (exact "0x" <|> return "") *> begin chars1_if (function '0' .. '9' | 'a'..'f' | 'A'..'F' -> true | _ -> false) >|= fun s -> @@ -583,5 +793,5 @@ end include CCShimsMkLet_.Make(struct type nonrec 'a t = 'a t include Infix - let monoid_product a1 a2 = pure (fun x y ->x,y) <*> a1 <*> a2 + let monoid_product = (|||) end) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 599c066c..1a2d5aa3 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -51,25 +51,34 @@ *) -type 'a or_error = ('a, string) result +(** {2 Errors} + @since NEXT_RELEASE *) +module Error : sig + type t + (** A parse error. + @since NEXT_RELEASE *) -type line_num = int -type col_num = int + val line_and_column : t -> int * int + (** Line and column numbers of the error position. *) -type parse_branch + val msg : t -> string -val string_of_branch : parse_branch -> string + val to_string : t -> string + (** Prints the error *) -exception ParseError of parse_branch * (unit -> string) -(** parsing branch * message. *) + val pp : Format.formatter -> t -> unit + (** Pretty prints the error *) +end + +type 'a or_error = ('a, Error.t) result +(* TODO: use [('a, error) result] instead, with easy conversion to [('a, string) result] *) + +exception ParseError of Error.t (** {2 Input} *) type position - -type state - -val state_of_string : string -> state +(* TODO: make a module Position: sig type t val line : t -> int val col : t -> int *) (** {2 Combinators} *) @@ -111,6 +120,10 @@ val ( *>) : _ t -> 'a t -> 'a t (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The results of [a] is ignored. *) +val (|||) : 'a t -> 'b t -> ('a * 'b) t +(** [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + val fail : string -> 'a t (** [fail msg] fails with the given message. It can trigger a backtrack. *) @@ -127,17 +140,28 @@ val eoi : unit t val nop : unit t (** Succeed with [()]. *) +val any_char : char t +(** [any_char] parses any character. + It still fails if the end of input was reached. + @since NEXT_RELEASE *) + +val any_chars : int -> string t +(** [any_chars len] parses exactly [len] characters from the input. + @since NEXT_RELEASE *) + val char : char -> char t (** [char c] parses the character [c] and nothing else. *) -val char_if : (char -> bool) -> char t -(** [char_if f] parses a character [c] if [f c = true]. *) +val char_if : ?descr:string -> (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true]. + @param descr describes what kind of character was expected *) val chars_if : (char -> bool) -> string t (** [chars_if f] parses a string of chars that satisfy [f]. *) -val chars1_if : (char -> bool) -> string t -(** Like {!chars_if}, but only non-empty strings. *) +val chars1_if : ?descr:string -> (char -> bool) -> string t +(** Like {!chars_if}, but only non-empty strings. + @param descr describes what kind of character was expected *) val endline : char t (** Parse '\n'. *) @@ -180,14 +204,10 @@ val (<|>) : 'a t -> 'a t -> 'a t to avoid wrapping large parsers with {!try_}). *) val () : 'a t -> string -> 'a t -(** [a msg] behaves like [a], but if [a] fails without - consuming any input, it fails with [msg] - instead. Useful as the last choice in a series of [<|>]: - [a <|> b <|> c "expected a|b|c"]. *) - -val try_ : 'a t -> 'a t -(** [try_ p] tries to parse like [p], but backtracks if [p] fails. - Useful in combination with [<|>]. *) +(** [a msg] behaves like [a], but if [a] fails, + [a ]. For example: + [a <|> b <|> c "expected one of a, b, c"]. *) val suspend : (unit -> 'a t) -> 'a t (** [suspend f] is the same as [f ()], but evaluates [f ()] only @@ -196,11 +216,46 @@ val suspend : (unit -> 'a t) -> 'a t val string : string -> string t (** [string s] parses exactly the string [s], and nothing else. *) +val exact : string -> string t +(** Alias to {!string}. + @since NEXT_RELEASE *) + val many : 'a t -> 'a list t -(** [many p] parses a list of [p], eagerly (as long as possible). *) +(** [many p] parses [p] repeatedly, until [p] fails, and + collects the results into a list. *) + +val optional : _ t -> unit t +(** [optional p] tries to parse [p], and return [()] whether it + succeeded or failed. Cannot fail. + @since NEXT_RELEASE *) + +val try_ : 'a t -> 'a option t +(** [try_ p] tries to parse using [p], and return [Some x] if [p] + succeeded with [x]. Otherwise it returns [None]. This cannot fail. + @since NEXT_RELEASE *) + +val many_until : until:_ t -> 'a t -> 'a list t +(** [many_until ~until p] parses as many [p] as it can until + the [until] parser successfully returns. + If [p] fails before that then [many_until ~until p] fails as well. + Typically [until] can be a closing ')' or another termination condition. + @since NEXT_RELEASE *) + +val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t +(** [try_or p1 ~f p2] attempts to parse [x] using [p1], + and then becomes [f x]. + If [p1] fails, then it becomes [p2]. + @since NEXT_RELEASE +*) + +val or_ : 'a t -> 'a t -> 'a t +(** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] + from the same position. + @since NEXT_RELEASE *) val many1 : 'a t -> 'a list t -(** Parse a non-empty list. *) +(** [many1 p] is like [many p] excepts it fails if the + list is empty (i.e. it needs [p] to succeed at least once). *) val skip : _ t -> unit t (** [skip p] parses zero or more times [p] and ignores its result. *) @@ -208,9 +263,23 @@ val skip : _ t -> unit t val sep : by:_ t -> 'a t -> 'a list t (** [sep ~by p] parses a list of [p] separated by [by]. *) +(* TODO: lookahead? *) + +val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t +(** Same as {!sep} but stop when [until] parses successfully. + @since NEXT_RELEASE *) + val sep1 : by:_ t -> 'a t -> 'a list t (** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) +val line : string t +(** Parse a line, '\n' excluded. + @since NEXT_RELEASE *) + +val each_line : 'a t -> 'a list t +(** [each_line p] runs [p] on each line of the input. + @since NEXT_RELEASE *) + val fix : ('a t -> 'a t) -> 'a t (** Fixpoint combinator. *) @@ -227,50 +296,45 @@ val memo : 'a t -> 'a t val fix_memo : ('a t -> 'a t) -> 'a t (** Like {!fix}, but the fixpoint is memoized. *) -val get_lnum : int t -(** Reflect the current line number. *) - -val get_cnum : int t -(** Reflect the current column number. *) - -val get_pos : (int * int) t -(** Reflect the current (line, column) numbers. *) - (** {2 Parse} Those functions have a label [~p] on the parser, since 0.14. *) -val parse : 'a t -> state -> 'a or_error -(** [parse p st] applies [p] on the input, and returns [Ok x] if - [p] succeeds with [x], or [Error s] otherwise. *) +val stringify_result : 'a or_error -> ('a, string) result +(** Turn a {!Error.t}-oriented result into a more basic string result. + @since NEXT_RELEASE *) -val parse_exn : 'a t -> state -> 'a -(** Unsafe version of {!parse}. - @raise ParseError if it fails. *) +val parse_string : 'a t -> string -> ('a, string) result +(** Parse a string using the parser. *) -val parse_string : 'a t -> string -> 'a or_error -(** Specialization of {!parse} for string inputs. *) +val parse_string_e : 'a t -> string -> 'a or_error +(** Version of {!parse_string} that returns a more detailed error. *) val parse_string_exn : 'a t -> string -> 'a (** @raise ParseError if it fails. *) -val parse_file : 'a t -> string -> 'a or_error -(** [parse_file p file] parses [file] with [p] by opening the file - and reading it whole. *) +val parse_file : 'a t -> string -> ('a, string) result +(** [parse_file p filename] parses file named [filename] with [p] + by opening the file and reading it whole. *) + +val parse_file_e : 'a t -> string -> 'a or_error +(** Version of {!parse_file} that returns a more detailed error. *) val parse_file_exn : 'a t -> string -> 'a -(** @raise ParseError if it fails. *) +(** Same as {!parse_file}, but + @raise ParseError if it fails. *) (** {2 Infix} *) module Infix : sig val (>|=) : 'a t -> ('a -> 'b) -> 'b t - (** Map. *) + (** Map. [p >|= f] parses an item [x] using [p], + and returns [f x]. *) val (>>=) : 'a t -> ('a -> 'b t) -> 'b t - (** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, + (** Monadic bind. + [p >>= f] results in a new parser which behaves as [p] then, in case of success, applies [f] to the result. *) val (<*>) : ('a -> 'b) t -> 'a t -> 'b t @@ -282,20 +346,21 @@ module Infix : sig val ( *>) : _ t -> 'a t -> 'a t (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - results of [a] is ignored. *) + result of [a] is ignored. *) val (<|>) : 'a t -> 'a t -> 'a t - (** [a <|> b] tries to parse [a], and if [a] fails without - consuming any input, backtracks and tries - to parse [b], otherwise it fails as [a]. - See {!try_} to ensure [a] does not consume anything (but it is best - to avoid wrapping large parsers with {!try_}). *) + (** [a <|> b] tries to parse [a], and if [a] fails, it backtracks and tries + to parse [b]. + Alias to {!or_} *) + + val (|||) : 'a t -> 'b t -> ('a * 'b) t + (** [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) val () : 'a t -> string -> 'a t - (** [a msg] behaves like [a], but if [a] fails without - consuming any input, it fails with [msg] - instead. Useful as the last choice in a series of [<|>]: - [a <|> b <|> c "expected a|b|c"]. *) + (** [a msg] behaves like [a], but if [a] fails, + it fails with [msg] instead. Useful as the last choice in a series of + [<|>]: [a <|> b <|> c "expected a|b|c"]. *) end @@ -306,9 +371,12 @@ end module U : sig val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t (** [list p] parses a list of [p], with the OCaml conventions for - start token "\[", stop token "\]" and separator ";". + start token "[", stop token "]" and separator ";". Whitespace between items are skipped. *) + (* TODO: parse option? *) + (* TODO: split on whitespace? *) + val int : int t (** Parse an int in decimal representation. *) @@ -320,14 +388,17 @@ module U : sig val word : string t (** Non empty string of alpha num, start with alpha. *) + (* TODO: boolean literal *) + (* TODO: quoted string *) + val pair : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> ('a * 'b) t - (** Parse a pair using OCaml whitespace conventions. + (** Parse a pair using OCaml syntactic conventions. The default is "(a, b)". *) val triple : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t - (** Parse a triple using OCaml whitespace conventions. + (** Parse a triple using OCaml syntactic conventions. The default is "(a, b, c)". *) end From 171b4ddcd9c33e38fe3725c76b3e505db26bf09a Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Wed, 12 May 2021 09:20:31 -0400 Subject: [PATCH 03/24] parse: deprecate `try_`, rename new function `try_opt` --- src/core/CCParse.ml | 4 +++- src/core/CCParse.mli | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index a069052b..d26d86b3 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -457,7 +457,9 @@ let fix f = and f_self = lazy (f self) in self -let try_ p : _ t = { +let try_ p = p + +let try_opt p : _ t = { run=fun st ~ok ~err:_ -> p.run st ~ok:(fun st x -> ok st (Some x)) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 1a2d5aa3..cba724aa 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -229,7 +229,15 @@ val optional : _ t -> unit t succeeded or failed. Cannot fail. @since NEXT_RELEASE *) -val try_ : 'a t -> 'a option t +val try_ : 'a t -> 'a t +(** [try_ p] is just like [p] (it used to play a role in backtracking + semantics but no more). + + @deprecated since NEXT_RELEASE it can just be removed. See {!try_opt} if you want + to detect failure. *) +[@@deprecated "plays no role anymore"] + +val try_opt : 'a t -> 'a option t (** [try_ p] tries to parse using [p], and return [Some x] if [p] succeeded with [x]. Otherwise it returns [None]. This cannot fail. @since NEXT_RELEASE *) From 37af485971cc0b5e39780b405940e42f11aa01e5 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 15:08:13 -0400 Subject: [PATCH 04/24] parse: expose Position module, add `or_`, `both`, `lookahead`, `U.bool` --- src/core/CCParse.ml | 174 +++++++++++++++++++++------------ src/core/CCParse.mli | 224 ++++++++++++++++++++++++++----------------- 2 files changed, 248 insertions(+), 150 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index d26d86b3..22e6af88 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -107,37 +107,64 @@ open CCShims_ () *) -module Error = struct - type t = { - msg: unit -> string; - str: string; - offset: int; (* offset in [e_str] *) - } +(* TODO: [type position = {state: state; i: int}] and recompute line, col + on demand *) +type position = { + pos_buffer: string; + pos_offset: int; + mutable pos_lc: (int * int) option; +} - let get_loc_ (self:t) : int * int = +module Position = struct + type t = position + + (* actually re-compute line and column from the buffer *) + let compute_line_and_col_ (self:t) : int * int = let i = ref 0 in let continue = ref true in let line = ref 1 in let col = ref 1 in - while !continue && !i < self.offset do - match String.index_from self.str !i '\n' with + while !continue && !i < self.pos_offset do + match String.index_from self.pos_buffer !i '\n' with | exception Not_found -> - col := self.offset - !i; continue := false; - | j when j > self.offset -> - col := self.offset - !i; continue := false; + col := self.pos_offset - !i; continue := false; + | j when j > self.pos_offset -> + col := self.pos_offset - !i; continue := false; | j -> incr line; i := j+1; done; !line, !col - let line_and_column self = get_loc_ self + let line_and_column self = + match self.pos_lc with + | Some tup -> tup + | None -> + let tup = compute_line_and_col_ self in + self.pos_lc <- Some tup; (* save *) + tup + + let line self = fst (line_and_column self) + let column self = snd (line_and_column self) + let pp out self = + let l, c = line_and_column self in + Format.fprintf out "at line %d, column %d" l c +end + +module Error = struct + type t = { + msg: unit -> string; + pos: position; + } + + let position self = self.pos + let line_and_column self = Position.line_and_column self.pos let msg self = self.msg() let to_string self = - let line,col = get_loc_ self in + let line,col = line_and_column self in Printf.sprintf "at line %d, char %d:\n%s" line col (self.msg()) let pp out self = - let line,col = get_loc_ self in + let line,col = line_and_column self in Format.fprintf out "at line %d, char %d:@ %s" line col (self.msg()) end @@ -157,10 +184,6 @@ module Memo_state = struct let id_ = ref 0 end -(* TODO: [type position = {state: state; i: int}] and recompute line, col - on demand *) -type position = int * int * int (* pos, line, column *) - (** Purely functional state passed around *) type state = { str: string; (* the input *) @@ -192,8 +215,8 @@ let state_of_string str = let[@inline] is_done st = st.i >= String.length st.str let[@inline] cur st = st.str.[st.i] -let mk_error_ st msg : Error.t = - {Error.msg; str=st.str; offset=st.i} +let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None} +let mk_error_ st msg : Error.t = {Error.msg; pos=pos_of_st_ st} (* consume one char, passing it to [ok]. *) let consume_ st ~ok ~err = @@ -221,14 +244,14 @@ let return x : _ t = { let pure = return -let (>|=) (p: 'a t) f : _ t = { +let map f (p: 'a t) : _ t = { run=fun st ~ok ~err -> p.run st ~ok:(fun st x -> ok st (f x)) ~err } -let (>>=) (p:'a t) f : _ t = { +let bind f (p:'a t) : _ t = { run=fun st ~ok ~err -> p.run st ~ok:(fun st x -> @@ -237,7 +260,7 @@ let (>>=) (p:'a t) f : _ t = { ~err } -let (<*>) (f:_ t) (a:_ t) : _ t = { +let ap (f:_ t) (a:_ t) : _ t = { run=fun st ~ok ~err -> f.run st ~ok:(fun st f -> @@ -245,7 +268,7 @@ let (<*>) (f:_ t) (a:_ t) : _ t = { ~err } -let (<*) (a:_ t) (b:_ t) : _ t = { +let ap_left (a:_ t) (b:_ t) : _ t = { run=fun st ~ok ~err -> a.run st ~ok:(fun st x -> @@ -253,7 +276,7 @@ let (<*) (a:_ t) (b:_ t) : _ t = { ~err } -let ( *> ) (a:_ t) (b:_ t) : _ t = { +let ap_right (a:_ t) (b:_ t) : _ t = { run=fun st ~ok ~err -> a.run st ~ok:(fun st _ -> @@ -261,7 +284,47 @@ let ( *> ) (a:_ t) (b:_ t) : _ t = { ~err } -let map f x = x >|= f +let or_ (p1:'a t) (p2:'a t) : _ t = { + run=fun st ~ok ~err -> + p1.run st ~ok + ~err:(fun _e -> p2.run st ~ok ~err) +} + +let both a b = { + run=fun st ~ok ~err -> + a.run st + ~ok:(fun st xa -> + b.run st ~ok:(fun st xb -> ok st (xa,xb)) ~err) + ~err +} + +let set_error_message msg (p:'a t) : _ t = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) +} + + +module Infix = struct + let[@inline] (>|=) p f = map f p + let[@inline] (>>=) p f = bind f p + let (<*>) = ap + let (<* ) = ap_left + let ( *>) = ap_right + let (<|>) = or_ + let (|||) = both + let[@inline] () p msg = set_error_message msg p + + include CCShimsMkLet_.Make(struct + type nonrec 'a t = 'a t + let (>>=) = (>>=) + let (>|=) = (>|=) + let monoid_product = both + end) +end + +include Infix + let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z @@ -390,15 +453,6 @@ let endline = let skip_space = skip_chars is_space let skip_white = skip_chars is_white -let or_ (p1:'a t) (p2:'a t) : _ t = { - run=fun st ~ok ~err -> - p1.run st ~ok - ~err:(fun _e -> p2.run st ~ok ~err) -} -let (<|>) = or_ - -let (|||) a b = map2 (fun x y ->x,y) a b - let try_or p1 ~f ~else_:p2 = { run=fun st ~ok ~err -> p1.run st @@ -412,12 +466,6 @@ let suspend f = { p.run st ~ok ~err } -let () (p:'a t) msg : _ t = { - run=fun st ~ok ~err -> - p.run st ~ok - ~err:(fun _e -> err (mk_error_ st (const_str_ msg))) -} - (* read [len] chars at once *) let any_chars len : _ t = { run=fun st ~ok ~err -> @@ -578,6 +626,17 @@ let sep1 ~by p = sep ~by p >|= fun tl -> x :: tl +let lookahead p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun _st x -> ok st x) (* discard old state *) + ~err +} + +(*$= & ~printer:(errpp Q.Print.(string)) + (Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd") +*) + let line : _ t = { run=fun st ~ok ~err -> if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) @@ -609,7 +668,13 @@ let parse_sub_ p_sub p : _ t = { p.run (state_of_string s) ~ok:(fun _ r -> ok st1 r) ~err:(fun e -> - err {e with Error.str=st0.str; offset=e.Error.offset + st0.i})) + let pos = e.pos in + let pos = { + pos_buffer=pos.pos_buffer; + pos_offset=pos.pos_offset + st0.i; + pos_lc=None; + } in + err {e with pos})) ~err } @@ -713,17 +778,6 @@ let parse_file_exn p file = | Ok x -> x | Error e -> raise (ParseError e) -module Infix = struct - let (>|=) = (>|=) - let (>>=) = (>>=) - let (<*>) = (<*>) - let (<* ) = (<* ) - let ( *>) = ( *>) - let (<|>) = (<|>) - let (|||) = (|||) - let () = () -end - module U = struct let sep_ = sep @@ -771,6 +825,12 @@ module U = struct let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) + let bool = (string "true" *> return true) <|> (string "false" *> return false) + (*$= & ~printer:(errpp Q.Print.bool) ~cmp:(erreq (=)) + (Ok true) (parse_string U.bool "true") + (Ok false) (parse_string U.bool "false") + *) + let pair ?(start="(") ?(stop=")") ?(sep=",") p1 p2 = skip_white *> string start *> skip_white *> p1 >>= fun x1 -> @@ -791,9 +851,3 @@ module U = struct p3 >>= fun x3 -> string stop *> return (x1,x2,x3) end - -include CCShimsMkLet_.Make(struct - type nonrec 'a t = 'a t - include Infix - let monoid_product = (|||) - end) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index cba724aa..5fa8ea17 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -4,7 +4,11 @@ (** {1 Very Simple Parser Combinators} These combinators can be used to write very simple parsers, for example - to extract data from a line-oriented file. + to extract data from a line-oriented file, or as a replacement to {!Scanf}. + + {2 A few examples} + + {4 Parse a tree} {[ open CCParse;; @@ -16,7 +20,7 @@ let ptree = fix @@ fun self -> skip_space *> - ( (try_ (char '(') *> (pure mk_node <*> self <*> self) <* char ')') + ( (char '(' *> (pure mk_node <*> self <*> self) <* char ')') <|> (U.int >|= mk_leaf) ) ;; @@ -49,8 +53,35 @@ assert (l=l');; ]} + {2 Stability guarantees} + + Some functions are marked "experimental" and are still subject to change. + *) +type position +(** A position in the input. Typically it'll point at the {b beginning} of + an error location. *) + +(** {2 Positions in input} + + @since NEXT_RELEASE *) +module Position : sig + type t = position + + val line : t -> int + (** Line number *) + + val column : t -> int + (** Column number *) + + val line_and_column : t -> int * int + (** Line and column number *) + + val pp : Format.formatter -> t -> unit + (** Unspecified pretty-printed version of the position. *) +end + (** {2 Errors} @since NEXT_RELEASE *) module Error : sig @@ -58,6 +89,9 @@ module Error : sig (** A parse error. @since NEXT_RELEASE *) + val position : t -> position + (** Returns position of the error *) + val line_and_column : t -> int * int (** Line and column numbers of the error position. *) @@ -71,15 +105,16 @@ module Error : sig end type 'a or_error = ('a, Error.t) result -(* TODO: use [('a, error) result] instead, with easy conversion to [('a, string) result] *) +(** ['a or_error] is either [Ok x] for some result [x : 'a], + or an error {!Error.t}. + + See {!stringify_result} and {!Error.to_string} to print the + error message. *) exception ParseError of Error.t (** {2 Input} *) -type position -(* TODO: make a module Position: sig type t val line : t -> int val col : t -> int *) - (** {2 Combinators} *) type 'a t @@ -95,33 +130,20 @@ val return : 'a -> 'a t val pure : 'a -> 'a t (** Synonym to {!return}. *) -val (>|=) : 'a t -> ('a -> 'b) -> 'b t -(** Map. *) - val map : ('a -> 'b) -> 'a t -> 'b t val map2 : ('a -> 'b -> 'c) -> 'a t -> 'b t -> 'c t val map3 : ('a -> 'b -> 'c -> 'd) -> 'a t -> 'b t -> 'c t -> 'd t -val (>>=) : 'a t -> ('a -> 'b t) -> 'b t -(** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. *) +val bind : ('a -> 'b t) -> 'a t -> 'b t +(** [bind f p] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. + @since NEXT_RELEASE +*) -val (<*>) : ('a -> 'b) t -> 'a t -> 'b t -(** Applicative. *) - -val (<* ) : 'a t -> _ t -> 'a t -(** [a <* b] parses [a] into [x], parses [b] and ignores its result, - and returns [x]. *) - -val ( *>) : _ t -> 'a t -> 'a t -(** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - results of [a] is ignored. *) - -val (|||) : 'a t -> 'b t -> ('a * 'b) t -(** [a ||| b] parses [a], then [b], then returns the pair of their results. +val ap : ('a -> 'b) t -> 'a t -> 'b t +(** Applicative. @since NEXT_RELEASE *) val fail : string -> 'a t @@ -196,19 +218,6 @@ val is_space : char -> bool val is_white : char -> bool (** True on ' ' and '\t' and '\n'. *) -val (<|>) : 'a t -> 'a t -> 'a t -(** [a <|> b] tries to parse [a], and if [a] fails without - consuming any input, backtracks and tries - to parse [b], otherwise it fails as [a]. - See {!try_} to ensure [a] does not consume anything (but it is best - to avoid wrapping large parsers with {!try_}). *) - -val () : 'a t -> string -> 'a t -(** [a msg] behaves like [a], but if [a] fails, - [a ]. For example: - [a <|> b <|> c "expected one of a, b, c"]. *) - val suspend : (unit -> 'a t) -> 'a t (** [suspend f] is the same as [f ()], but evaluates [f ()] only when needed. *) @@ -230,12 +239,12 @@ val optional : _ t -> unit t @since NEXT_RELEASE *) val try_ : 'a t -> 'a t +[@@deprecated "plays no role anymore, just replace [try foo] with [foo]"] (** [try_ p] is just like [p] (it used to play a role in backtracking semantics but no more). @deprecated since NEXT_RELEASE it can just be removed. See {!try_opt} if you want to detect failure. *) -[@@deprecated "plays no role anymore"] val try_opt : 'a t -> 'a option t (** [try_ p] tries to parse using [p], and return [Some x] if [p] @@ -247,12 +256,17 @@ val many_until : until:_ t -> 'a t -> 'a list t the [until] parser successfully returns. If [p] fails before that then [many_until ~until p] fails as well. Typically [until] can be a closing ')' or another termination condition. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t -(** [try_or p1 ~f p2] attempts to parse [x] using [p1], +(** [try_or p1 ~f ~else_:p2] attempts to parse [x] using [p1], and then becomes [f x]. - If [p1] fails, then it becomes [p2]. + If [p1] fails, then it becomes [p2]. This can be useful if [f] is expensive + but only ever works if [p1] matches (e.g. after an opening parenthesis + or some sort of prefix). @since NEXT_RELEASE *) @@ -261,6 +275,16 @@ val or_ : 'a t -> 'a t -> 'a t from the same position. @since NEXT_RELEASE *) +val both : 'a t -> 'b t -> ('a * 'b) t +(** [both a b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + +val set_error_message : string -> 'a t -> 'a t +(** [set_error_message msg p] behaves like [p], but if [p] fails, + [set_error_message msg p] fails with [msg] instead. + @since NEXT_RELEASE +*) + val many1 : 'a t -> 'a list t (** [many1 p] is like [many p] excepts it fails if the list is empty (i.e. it needs [p] to succeed at least once). *) @@ -271,8 +295,6 @@ val skip : _ t -> unit t val sep : by:_ t -> 'a t -> 'a list t (** [sep ~by p] parses a list of [p] separated by [by]. *) -(* TODO: lookahead? *) - val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t (** Same as {!sep} but stop when [until] parses successfully. @since NEXT_RELEASE *) @@ -280,12 +302,19 @@ val sep_until: until:_ t -> by:_ t -> 'a t -> 'a list t val sep1 : by:_ t -> 'a t -> 'a list t (** [sep1 ~by p] parses a non empty list of [p], separated by [by]. *) +val lookahead : 'a t -> 'a t +(** [lookahead p] behaves like [p], except it doesn't consume any input. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + val line : string t (** Parse a line, '\n' excluded. @since NEXT_RELEASE *) val each_line : 'a t -> 'a list t (** [each_line p] runs [p] on each line of the input. + {b EXPERIMENTAL} @since NEXT_RELEASE *) val fix : ('a t -> 'a t) -> 'a t @@ -299,15 +328,70 @@ val memo : 'a t -> 'a t This can be costly in memory, but improve the run time a lot if there is a lot of backtracking involving [p]. + Do not call {!memo} inside other functions, especially with {!(>>=)}, + {!map}, etc. being so prevalent. Instead the correct way to use it + is in a toplevel definition: + + {[ + let my_expensive_parser = memo (foo *> bar >>= fun i -> …) + ]} + This function is not thread-safe. *) val fix_memo : ('a t -> 'a t) -> 'a t (** Like {!fix}, but the fixpoint is memoized. *) -(** {2 Parse} +(** {2 Infix} *) - Those functions have a label [~p] on the parser, since 0.14. -*) +module Infix : sig + val (>|=) : 'a t -> ('a -> 'b) -> 'b t + (** Alias to {!map}. [p >|= f] parses an item [x] using [p], + and returns [f x]. *) + + val (>>=) : 'a t -> ('a -> 'b t) -> 'b t + (** Alias to {!bind}. + [p >>= f] results in a new parser which behaves as [p] then, + in case of success, applies [f] to the result. *) + + val (<*>) : ('a -> 'b) t -> 'a t -> 'b t + (** Applicative. *) + + val (<* ) : 'a t -> _ t -> 'a t + (** [a <* b] parses [a] into [x], parses [b] and ignores its result, + and returns [x]. *) + + val ( *>) : _ t -> 'a t -> 'a t + (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The + result of [a] is ignored. *) + + val (<|>) : 'a t -> 'a t -> 'a t + (** Alias to {!or_}. + + [a <|> b] tries to parse [a], and if [a] fails without + consuming any input, backtracks and tries + to parse [b], otherwise it fails as [a]. + See {!try_} to ensure [a] does not consume anything (but it is best + to avoid wrapping large parsers with {!try_}). *) + + val () : 'a t -> string -> 'a t + (** [a msg] behaves like [a], but if [a] fails, + [a ]. For example: + [a <|> b <|> c "expected one of a, b, c"]. *) + + val (|||) : 'a t -> 'b t -> ('a * 'b) t + (** Alias to {!both}. + [a ||| b] parses [a], then [b], then returns the pair of their results. + @since NEXT_RELEASE *) + + (** Let operators on OCaml >= 4.08.0, nothing otherwise + @since 2.8 *) + include CCShimsMkLet_.S with type 'a t_let := 'a t +end + +include module type of Infix + +(** {2 Parse input} *) val stringify_result : 'a or_error -> ('a, string) result (** Turn a {!Error.t}-oriented result into a more basic string result. @@ -333,44 +417,6 @@ val parse_file_exn : 'a t -> string -> 'a (** Same as {!parse_file}, but @raise ParseError if it fails. *) -(** {2 Infix} *) - -module Infix : sig - val (>|=) : 'a t -> ('a -> 'b) -> 'b t - (** Map. [p >|= f] parses an item [x] using [p], - and returns [f x]. *) - - val (>>=) : 'a t -> ('a -> 'b t) -> 'b t - (** Monadic bind. - [p >>= f] results in a new parser which behaves as [p] then, - in case of success, applies [f] to the result. *) - - val (<*>) : ('a -> 'b) t -> 'a t -> 'b t - (** Applicative. *) - - val (<* ) : 'a t -> _ t -> 'a t - (** [a <* b] parses [a] into [x], parses [b] and ignores its result, - and returns [x]. *) - - val ( *>) : _ t -> 'a t -> 'a t - (** [a *> b] parses [a], then parses [b] into [x], and returns [x]. The - result of [a] is ignored. *) - - val (<|>) : 'a t -> 'a t -> 'a t - (** [a <|> b] tries to parse [a], and if [a] fails, it backtracks and tries - to parse [b]. - Alias to {!or_} *) - - val (|||) : 'a t -> 'b t -> ('a * 'b) t - (** [a ||| b] parses [a], then [b], then returns the pair of their results. - @since NEXT_RELEASE *) - - val () : 'a t -> string -> 'a t - (** [a msg] behaves like [a], but if [a] fails, - it fails with [msg] instead. Useful as the last choice in a series of - [<|>]: [a <|> b <|> c "expected a|b|c"]. *) - -end (** {2 Utils} @@ -396,7 +442,9 @@ module U : sig val word : string t (** Non empty string of alpha num, start with alpha. *) - (* TODO: boolean literal *) + val bool : bool t + (** Accepts "true" or "false" *) + (* TODO: quoted string *) val pair : ?start:string -> ?stop:string -> ?sep:string -> @@ -409,7 +457,3 @@ module U : sig (** Parse a triple using OCaml syntactic conventions. The default is "(a, b, c)". *) end - -(** Let operators on OCaml >= 4.08.0, nothing otherwise - @since 2.8 *) -include CCShimsMkLet_.S with type 'a t_let := 'a t From c10ae8d84f25212e3cd30b1b9ccc21b844c098e7 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 17:03:11 -0400 Subject: [PATCH 05/24] parse: fix bugs, add tests, add `U.{in_paren,in_paren_opts,option}` --- src/core/CCParse.ml | 69 ++++++++++++++++++++++++++++++++++++++------ src/core/CCParse.mli | 14 +++++++++ 2 files changed, 74 insertions(+), 9 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 22e6af88..f53f789c 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -119,17 +119,17 @@ module Position = struct type t = position (* actually re-compute line and column from the buffer *) - let compute_line_and_col_ (self:t) : int * int = + let compute_line_and_col_ (s:string) (off:int) : int * int = let i = ref 0 in let continue = ref true in let line = ref 1 in let col = ref 1 in - while !continue && !i < self.pos_offset do - match String.index_from self.pos_buffer !i '\n' with + while !continue && !i < off && !i < String.length s do + match String.index_from s !i '\n' with | exception Not_found -> - col := self.pos_offset - !i; continue := false; - | j when j > self.pos_offset -> - col := self.pos_offset - !i; continue := false; + col := off - !i; continue := false; + | j when j > off -> + col := off - !i; continue := false; | j -> incr line; i := j+1; done; !line, !col @@ -138,7 +138,7 @@ module Position = struct match self.pos_lc with | Some tup -> tup | None -> - let tup = compute_line_and_col_ self in + let tup = compute_line_and_col_ self.pos_buffer self.pos_offset in self.pos_lc <- Some tup; (* save *) tup @@ -339,6 +339,12 @@ let eoi = { else err (mk_error_ st (const_str_ "expected end of input")) } +(*$= & ~printer:(errpp Q.Print.bool) ~cmp:(erreq (=)) + (Ok true) (parse_string (U.bool <* eoi) "true") + (Error "") (parse_string (U.bool <* eoi) "true ") + (Ok true) (parse_string (U.bool <* skip_white <* eoi) "true") +*) + let fail msg : _ t = { run=fun st ~ok:_ ~err -> err (mk_error_ st (const_str_ msg)) @@ -369,7 +375,7 @@ let char c : _ t = { ~ok:(fun st c2 -> if char_equal c c2 then ok st c else ( - let msg() = Printf.sprintf "expected '%c', got '%c'" c (cur st) in + let msg() = Printf.sprintf "expected '%c', got '%c'" c c2 in err (mk_error_ st msg) )) ~err @@ -794,6 +800,49 @@ module U = struct try return (int_of_string s) with Failure _ -> fail "expected an int" + (*$= & ~printer:(errpp Q.Print.int) ~cmp:(erreq (=)) + (Ok 42) (parse_string U.int " 42") + (Ok 2) (parse_string U.int "2") + (Error "") (parse_string U.int "abc") + (Error "") (parse_string U.int "") + *) + + let in_paren (p:'a t) : 'a t = + skip_white *> + (char '(' *> skip_white *> p <* skip_white <* char ')') + + let in_parens_opt (p:'a t) : 'a t = + fix (fun self -> + skip_white *> + try_or + (char '(') + ~f:(fun _ -> skip_white *> self <* skip_white <* char ')') + ~else_:p) + + (*$= & ~printer:(errpp Q.Print.int) ~cmp:(erreq (=)) + (Ok 15) (parse_string (U.in_paren (U.in_paren U.int)) "( ( 15) )") + (Ok 2) (parse_string (U.in_paren U.int) "(2)") + (Error "") (parse_string (U.in_paren U.int) "2") + (Error "") (parse_string (U.in_paren U.int) "") + (Ok 2) (parse_string (U.in_parens_opt U.int) "((((2))))") + (Ok 2) (parse_string (U.in_parens_opt U.int) "2") + (Ok 200) (parse_string (U.in_parens_opt U.int) "( ( 200 ) )") + *) + + let option p = + skip_white *> + try_or + (string "Some") + ~f:(fun _ -> skip_white *> p >|= fun x -> Some x) + ~else_:(string "None" *> return None) + + (*$= & ~printer:(errpp Q.Print.(option int)) ~cmp:(erreq (=)) + (Ok (Some 12)) (parse_string U.(option int) " Some 12") + (Ok None) (parse_string U.(option int) " None") + (Ok (Some 0)) (parse_string U.(option int) "Some 0") + (Ok (Some 0)) (parse_string U.(in_parens_opt @@ option int) "(( Some 0) )") + *) + let hexa_int = (exact "0x" <|> return "") *> begin @@ -825,7 +874,9 @@ module U = struct let word = map2 prepend_str (char_if is_alpha) (chars_if is_alpha_num) - let bool = (string "true" *> return true) <|> (string "false" *> return false) + let bool = + skip_white *> + ((string "true" *> return true) <|> (string "false" *> return false)) (*$= & ~printer:(errpp Q.Print.bool) ~cmp:(erreq (=)) (Ok true) (parse_string U.bool "true") (Ok false) (parse_string U.bool "false") diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 5fa8ea17..0438bdd2 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -434,6 +434,20 @@ module U : sig val int : int t (** Parse an int in decimal representation. *) + val in_paren : 'a t -> 'a t + (** [in_paren p] parses an opening "(",[p] , and then ")". + @since NEXT_RELEASE *) + + val in_parens_opt : 'a t -> 'a t + (** [in_parens_opt p] parses [p] in an arbitrary number of nested + parenthesis (possibly 0). + @since NEXT_RELEASE *) + + val option : 'a t -> 'a option t + (** [option p] parses "Some " into [Some x] if [p] parses "" into [x], + and parses "None" into [None]. + @since NEXT_RELEASE *) + val hexa_int : int t (** Parse an int int hexadecimal format. Accepts an optional [0x] prefix, and ignores capitalization. From 78a530cceef74df00ccc3d66f2c09cd4da241e4e Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 17:15:51 -0400 Subject: [PATCH 06/24] feat(ord): add `poly`, deprecate `compare` --- src/core/CCOrd.mli | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/core/CCOrd.mli b/src/core/CCOrd.mli index 7b121a49..e2f9454d 100644 --- a/src/core/CCOrd.mli +++ b/src/core/CCOrd.mli @@ -6,11 +6,21 @@ type 'a t = 'a -> 'a -> int (** Comparison (total ordering) between two elements, that returns an int. *) +val poly : 'a t +(** Polymorphic "magic" comparison. Use with care, as it will fail on + some types. + @since NEXT_RELEASE *) + val compare : 'a t -(** Polymorphic "magic" comparison. *) +[@@deprecated "use CCOrd.poly instead, this name is too general"] +(** Polymorphic "magic" comparison. + @deprecated since NEXT_RELEASE in favor of {!poly}. The reason is that + [compare] is easily shadowed, can shadow other comparators, and is just + generally not very descriptive. *) val opp : 'a t -> 'a t -(** Opposite order. *) +(** Opposite order. For example, [opp a b < 0] iff [opp b a > 0]. + This can be used to sort values in the opposite order, among other things. *) val equiv : int -> int -> bool (** Returns [true] iff the two comparison results are the same. *) From 7081a411c8f8c614f579b9957b2021b721c02586 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 17:16:26 -0400 Subject: [PATCH 07/24] small doc change --- src/core/CCParse.ml | 1 - 1 file changed, 1 deletion(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index f53f789c..12bc43c3 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -9,7 +9,6 @@ open CCShims_ type tree = L of int | N of tree * tree end open T - open Result let mk_leaf x = L x let mk_node x y = N(x,y) From 294fce863477f54fa2ec3f06b8aba70c92bc1b18 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 17:39:09 -0400 Subject: [PATCH 08/24] fixup! feat(ord): add `poly`, deprecate `compare` --- src/core/CCOrd.ml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/CCOrd.ml b/src/core/CCOrd.ml index 413b2453..9a6b8513 100644 --- a/src/core/CCOrd.ml +++ b/src/core/CCOrd.ml @@ -8,6 +8,7 @@ open CCShims_ type 'a t = 'a -> 'a -> int (** Comparison (total ordering) between two elements, that returns an int *) +let poly = Stdlib.compare let compare = Stdlib.compare let opp f x y = - (f x y) From c63a2b7b37b6db829091557d79b3f28d286518ed Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 18:49:36 -0400 Subject: [PATCH 09/24] fix tests: use dune's `locks` with absolute path --- qtest/dune | 6 +++--- src/core/tests/dune | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/qtest/dune b/qtest/dune index e3b5e712..5775f212 100644 --- a/qtest/dune +++ b/qtest/dune @@ -19,7 +19,7 @@ (alias (name runtest) - (locks ctest) + (locks /ctest) (package containers) (action (run ./run_qtest.exe))) @@ -39,7 +39,7 @@ (alias (name runtest) (package containers-data) - (locks ctest) + (locks /ctest) (action (run ./run_qtest_data.exe))) (rule @@ -57,6 +57,6 @@ (alias (name runtest) - (locks ctest) + (locks /ctest) (package containers-thread) (action (run ./run_qtest_thread.exe))) diff --git a/src/core/tests/dune b/src/core/tests/dune index ef6300eb..bca775fc 100644 --- a/src/core/tests/dune +++ b/src/core/tests/dune @@ -20,13 +20,13 @@ (alias (name runtest) - (locks ctest) + (locks /ctest) (package containers) (action (run ./test_random.exe))) ; what matters is that it compiles (alias (name runtest) - (locks ctest) + (locks /ctest) (package containers) (action (run ./check_labelled_mods.exe))) From 88fe234a4c82d74866fde282007fd441872ea801 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 18:49:55 -0400 Subject: [PATCH 10/24] add `CCParse.{char_fold, chars_fold_map}` useful for non-trivial lexing --- src/core/CCParse.ml | 89 +++++++++++++++++++++++++++++++++++++++++++- src/core/CCParse.mli | 53 ++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 2 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 12bc43c3..aaaae7f6 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -160,11 +160,11 @@ module Error = struct let msg self = self.msg() let to_string self = let line,col = line_and_column self in - Printf.sprintf "at line %d, char %d:\n%s" line col (self.msg()) + Printf.sprintf "at line %d, char %d: %s" line col (self.msg()) let pp out self = let line,col = line_and_column self in - Format.fprintf out "at line %d, char %d:@ %s" line col (self.msg()) + Format.fprintf out "@[at line %d, char %d:@ %s@]" line col (self.msg()) end type 'a or_error = ('a, Error.t) result @@ -429,6 +429,65 @@ let chars1_if ?descr p = { ~err } +exception Fold_fail of state * string + +let chars_fold ~f acc0 = { + run=fun st ~ok ~err -> + let i0 = st.i in + let i = ref i0 in + let acc = ref acc0 in + let continue = ref true in + try + while !continue do + let st = {st with i = !i} in + if is_done st then ( + continue := false; + ) else ( + let c = cur st in + match f !acc c with + | `Continue acc' -> + incr i; + acc := acc' + | `Stop -> continue := false; + | `Consume_and_stop -> incr i; continue := false + | `Fail msg -> raise (Fold_fail (st,msg)) + ) + done; + ok {st with i= !i} !acc + with Fold_fail (st,msg) -> err (mk_error_ st (const_str_ msg)) +} + +let chars_fold_map ~f acc0 = { + run=fun st ~ok ~err -> + let i0 = st.i in + let i = ref i0 in + let acc = ref acc0 in + let continue = ref true in + let buf = Buffer.create 16 in + try + while !continue do + let st = {st with i = !i} in + if is_done st then ( + continue := false; + ) else ( + let c = cur st in + match f !acc c with + | `Continue acc' -> + incr i; + acc := acc' + | `Yield (acc', c') -> + incr i; + acc := acc'; + Buffer.add_char buf c'; + | `Stop -> continue := false; + | `Consume_and_stop -> incr i; continue := false + | `Fail msg -> raise (Fold_fail (st,msg)) + ) + done; + ok {st with i= !i} (!acc, Buffer.contents buf) + with Fold_fail (st,msg) -> err (mk_error_ st (const_str_ msg)) +} + let skip_chars p : _ t = let rec self = { run=fun st ~ok ~err -> @@ -901,3 +960,29 @@ module U = struct p3 >>= fun x3 -> string stop *> return (x1,x2,x3) end + +module Debug_ = struct + let trace_fail name p = { + run=fun st ~ok ~err -> + p.run st ~ok + ~err:(fun e -> + Printf.eprintf "trace %s: fail with %s\n%!" name (Error.to_string e); + err e) + } + + let trace_ ~both name ~print p = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st x -> + Printf.eprintf "trace %s: parsed %s\n%!" name (print x); + ok st x) + ~err:(fun e -> + if both then ( + Printf.eprintf "trace %s: fail with %s\n%!" name (Error.to_string e); + ); + err e) + } + + let trace_success name ~print p = trace_ ~both:false name ~print p + let trace_success_or_fail name ~print p = trace_ ~both:true name ~print p +end diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 0438bdd2..e20b4914 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -185,6 +185,43 @@ val chars1_if : ?descr:string -> (char -> bool) -> string t (** Like {!chars_if}, but only non-empty strings. @param descr describes what kind of character was expected *) +val chars_fold : + f:('acc -> char -> + [`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) -> + 'acc -> + 'acc t +(** [chars_fold f acc0] folds over characters of the input. + Each char [c] is passed, along with the current accumulator, to [f]; + [f] can either: + + - stop, by returning [`Stop]. In this case the current accumulator + is returned, and [c] is not consumed. + - consume char and stop, by returning [`Consume_and_stop]. + - fail, by returning [`Fail msg]. In this case the parser fails + with the given message. + - continue, by returning [`Continue acc]. The parser continues to the + next char with the new accumulator. + + This is a generalization of of {!chars_if} that allows one to transform + characters on the fly, skip some, handle escape sequences, etc. + + @since NEXT_RELEASE *) + +val chars_fold_map : + f:('acc -> char -> + [`Continue of 'acc | `Yield of 'acc * char + | `Consume_and_stop | `Stop | `Fail of string]) -> + 'acc -> + ('acc * string) t +(** Same as {!char_fold} but with the following differences: + + - returns a string along with the accumulator. The string is built from + characters returned by [`Yield]. + - new case [`Yield (acc, c)] adds [c] to the returned string + and continues parsing with [acc]. + + @since NEXT_RELEASE *) + val endline : char t (** Parse '\n'. *) @@ -471,3 +508,19 @@ module U : sig (** Parse a triple using OCaml syntactic conventions. The default is "(a, b, c)". *) end + +(** Debugging utils. + {b EXPERIMENTAL} *) +module Debug_ : sig + val trace_fail : string -> 'a t -> 'a t + (** [trace_fail name p] behaves like [p], but prints the error message of [p] + on stderr whenever [p] fails. + @param name used as a prefix of all trace messages. *) + + val trace_success : string -> print:('a -> string) -> 'a t -> 'a t + (** [trace_success name ~print p] behaves like [p], but + prints successful runs of [p] using [print]. *) + + val trace_success_or_fail : string -> print:('a -> string) -> 'a t -> 'a t + (** Trace both error or success *) +end From 7bdc3cff24fa6b0e7fb52f59a45f0d250a14a8ae Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 18:50:28 -0400 Subject: [PATCH 11/24] add example `CCParse`-based Sexpr parser, and a test --- examples/ccparse_sexp.ml | 66 ++++++++++++++++++++++++++++++ examples/dune | 20 +++++++-- examples/test_data/benchpress.sexp | 13 ++++++ 3 files changed, 96 insertions(+), 3 deletions(-) create mode 100644 examples/ccparse_sexp.ml create mode 100644 examples/test_data/benchpress.sexp diff --git a/examples/ccparse_sexp.ml b/examples/ccparse_sexp.ml new file mode 100644 index 00000000..369542ab --- /dev/null +++ b/examples/ccparse_sexp.ml @@ -0,0 +1,66 @@ +open CCParse + +type sexp = Atom of string | List of sexp list + +let rec pp_sexpr out (s:sexp) : unit = match s with + | Atom s -> Format.fprintf out "%S" s + | List l -> + Format.fprintf out "(@["; + List.iteri (fun i s -> if i>0 then Format.fprintf out "@ "; pp_sexpr out s) l; + Format.fprintf out "@])" + +let str_of_sexp = CCFormat.to_string pp_sexpr + +let skip_white_and_comments = + fix @@ fun self -> + skip_white *> + ( try_or (char ';') + ~f:(fun _ -> skip_chars (function '\n' -> false | _ -> true) *> self) + ~else_:(return ()) + ) + +let atom = + chars_fold_map `Start + ~f:(fun acc c -> + match acc, c with + | `Start, '"' -> `Continue `In_quote + | `Start, (' ' | '\t' | '\n' | '(' | ')' | ';') -> `Fail "atom" + | `Normal, (' ' | '\t' | '\n' | '(' | ')' | ';') -> `Stop + | `Done, _ -> `Stop + | `In_quote, '"' -> `Continue `Done (* consume *) + | `In_quote, '\\' -> `Continue `Escape + | `In_quote, c -> `Yield (`In_quote, c) + | `Escape, 'n' -> `Yield (`In_quote, '\n') + | `Escape, 't' -> `Yield (`In_quote, '\t') + | `Escape, '"' -> `Yield (`In_quote, '"') + | `Escape, '\\' -> `Yield (`In_quote, '\\') + | `Escape, c -> `Fail (Printf.sprintf "unknown escape code \\%c" c) + | (`Start | `Normal), c -> `Yield (`Normal, c) + | _ -> `Fail "invalid atom" + ) + >>= function + | `In_quote, _ -> fail "unclosed \"" + | `Escape, _ -> fail "unfinished escape sequence" + | _, "" -> fail "expected non-empty atom" + | _, s -> return (Atom s) + +let psexp = + fix @@ fun self -> + skip_white_and_comments *> + try_or (char '(') + ~f:(fun _ -> + (sep ~by:skip_white_and_comments self + <* skip_white_and_comments <* char ')') >|= fun l -> List l) + ~else_:atom + +let psexp_l = + many_until ~until:(skip_white_and_comments *> eoi) psexp + +let () = + let s = CCIO.File.read_exn Sys.argv.(1) in + match parse_string psexp_l s with + | Ok l -> + Format.printf "parsed:@."; + List.iter (Format.printf "%a@." pp_sexpr) l + | Error e -> + Format.printf "parse error: %s@." e; exit 1 diff --git a/examples/dune b/examples/dune index 33361d56..554c2165 100644 --- a/examples/dune +++ b/examples/dune @@ -1,8 +1,22 @@ -(executable - (name id_sexp) +(executables + (names id_sexp ccparse_sexp) (libraries containers) - (modules id_sexp) + ;(modules id_sexp) (flags :standard -warn-error -a+8 -safe-string -color always) (ocamlopt_flags :standard -O3 -color always -unbox-closures -unbox-closures-factor 20)) + +(alias + (name runtest) + (locks /ctest) + (action + (ignore-stdout + (run ./id_sexp.exe test_data/benchpress.sexp)))) + +(alias + (name runtest) + (locks /ctest) + (action + (ignore-stdout + (run ./ccparse_sexp.exe test_data/benchpress.sexp)))) diff --git a/examples/test_data/benchpress.sexp b/examples/test_data/benchpress.sexp new file mode 100644 index 00000000..0b48e773 --- /dev/null +++ b/examples/test_data/benchpress.sexp @@ -0,0 +1,13 @@ + +(prover + (name msat) + (synopsis "msat for pure sat problems") + (version "git:.") + (sat "^Sat") + (unsat "^Unsat") + (cmd "$cur_dir/../msat.exe -time $timeout $file")) + +(dir + (path $cur_dir) + (pattern ".*\\.cnf") + (expect (const unknown))) From 1517f64f55a3611720ae4e27e2f3a4e02a2bdaf7 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 22:42:20 -0400 Subject: [PATCH 12/24] CCParse: add `slice` and the ability to recurse on them the idea is that it's often convenient to split the input into smaller part (e.g. lines), or do a first pass of parsing that just returns a slice of the input; and then later to use another parser on that slice to extract the actual data. The new notion of `slice` allows that, while preserving locations wrt the original input. --- src/core/CCParse.ml | 253 +++++++++++++++++++++++++++++++++--------- src/core/CCParse.mli | 258 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 427 insertions(+), 84 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index aaaae7f6..2fbdf451 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -167,7 +167,7 @@ module Error = struct Format.fprintf out "@[at line %d, char %d:@ %s@]" line col (self.msg()) end -type 'a or_error = ('a, Error.t) result +type +'a or_error = ('a, Error.t) result module Memo_tbl = Hashtbl.Make(struct type t = int * int (* id of parser, position *) @@ -186,7 +186,8 @@ end (** Purely functional state passed around *) type state = { str: string; (* the input *) - i: int; (* offset in [input.str] *) + i: int; (* offset in [str] *) + j: int; (* end pointer in [str], excluded. [len = j-i] *) memo : Memo_state.t option ref; (* Memoization table, if any *) } @@ -207,11 +208,12 @@ let state_of_string str = let s = { str; i=0; + j=String.length str; memo=ref None; } in s -let[@inline] is_done st = st.i >= String.length st.str +let[@inline] is_done st = st.i >= st.j let[@inline] cur st = st.str.[st.i] let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None} @@ -328,7 +330,7 @@ let map2 f x y = pure f <*> x <*> y let map3 f x y z = pure f <*> x <*> y <*> z let junk_ (st:state) : state = - assert (st.i < String.length st.str); + assert (st.i < st.j); {st with i=st.i + 1} let eoi = { @@ -344,11 +346,61 @@ let eoi = { (Ok true) (parse_string (U.bool <* skip_white <* eoi) "true") *) +let with_pos p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun st' x -> + ok st' (x, pos_of_st_ st)) + ~err +} + +(* a slice is just a state, which makes {!recurse} quite easy. *) +type slice = state + +module Slice = struct + type t = slice + let length sl = sl.j - sl.i + let is_empty sl = sl.i = sl.j + let to_string sl = String.sub sl.str sl.i (length sl) +end + +let recurse slice p : _ t = { + run=fun _st ~ok ~err -> + (* make sure these states are related. all slices share the + same reference as the initial state they derive from. *) + assert CCShims_.Stdlib.(_st.memo == slice.memo); + p.run slice ~ok ~err +} + +let all = { + run=fun st ~ok ~err:_ -> + if is_done st then ok st st + else ( + let st_done = {st with i=st.j} in + ok st_done st + ) +} + +let all_str = all >|= Slice.to_string + +(*$= & ~printer:(errpp Q.Print.string) ~cmp:(erreq (=)) + (Ok "abcd") (parse_string all_str "abcd") + (Ok "cd") (parse_string (string "ab" *> all_str) "ab") +*) + +(*$= & ~printer:(errpp Q.Print.(pair string string)) ~cmp:(erreq (=)) + (Ok ("foobar", "")) (parse_string (both all_str all_str) "foobar") + *) + let fail msg : _ t = { run=fun st ~ok:_ ~err -> err (mk_error_ st (const_str_ msg)) } let failf msg = Printf.ksprintf fail msg +let fail_lazy msg = { + run=fun st ~ok:_ ~err -> + err (mk_error_ st msg) +} let parsing what p = { run=fun st ~ok ~err -> @@ -360,9 +412,10 @@ let parsing what p = { err {e with Error.msg}) } -let nop = { +let empty = { run=fun st ~ok ~err:_ -> ok st (); } +let nop = empty let any_char = { run=fun st ~ok ~err -> consume_ st ~ok ~err @@ -398,19 +451,35 @@ let char_if ?descr p = { ~err } -let chars_if p = { +let take_if p : slice t = { run=fun st ~ok ~err:_ -> - let i0 = st.i in - let i = ref i0 in + let i = ref st.i in while let st = {st with i = !i} in not (is_done st) && p (cur st) do incr i; done; - ok {st with i = !i} (String.sub st.str i0 (!i - i0)) + ok {st with i = !i} {st with j= !i} } +let take1_if ?descr p = + take_if p >>= fun sl -> + if Slice.is_empty sl then ( + let msg() = + let what = match descr with + | None -> "" + | Some d -> Printf.sprintf " for %s" d + in + Printf.sprintf "expected non-empty sequence of chars%s" what + in + fail_lazy msg + ) else ( + return sl + ) + +let chars_if p = take_if p >|= Slice.to_string + let chars1_if ?descr p = { run=fun st ~ok ~err -> (chars_if p).run st @@ -453,11 +522,11 @@ let chars_fold ~f acc0 = { | `Fail msg -> raise (Fold_fail (st,msg)) ) done; - ok {st with i= !i} !acc + ok {st with i= !i} (!acc, {st with j= !i}) with Fold_fail (st,msg) -> err (mk_error_ st (const_str_ msg)) } -let chars_fold_map ~f acc0 = { +let chars_fold_transduce ~f acc0 = { run=fun st ~ok ~err -> let i0 = st.i in let i = ref i0 in @@ -524,6 +593,22 @@ let try_or p1 ~f ~else_:p2 = { ~err:(fun _ -> p2.run st ~ok ~err) } +let try_or_l ?(msg="try_or_l ran out of options") ?else_ l : _ t = { + run=fun st ~ok ~err -> + let rec loop = function + | (test, p) :: tl -> + test.run st + ~ok:(fun _ _ -> p.run st ~ok ~err) (* commit *) + ~err:(fun _ -> loop tl) + | [] -> + begin match else_ with + | None -> err (mk_error_ st (const_str_ msg)) + | Some p -> p.run st ~ok ~err + end + in + loop l +} + let suspend f = { run=fun st ~ok ~err -> let p = f () in @@ -531,12 +616,12 @@ let suspend f = { } (* read [len] chars at once *) -let any_chars len : _ t = { +let take len : slice t = { run=fun st ~ok ~err -> - if st.i + len <= String.length st.str then ( - let s = String.sub st.str st.i len in + if st.i + len <= st.j then ( + let slice = {st with j = st.i + len} in let st = {st with i = st.i + len} in - ok st s + ok st slice ) else ( let msg() = Printf.sprintf "expected to be able to consume %d chars" len @@ -545,6 +630,8 @@ let any_chars len : _ t = { ) } +let any_chars len : _ t = take len >|= Slice.to_string + let exact s = { run=fun st ~ok ~err -> (* parse a string of length [String.length s] and compare with [s] *) @@ -697,58 +784,116 @@ let lookahead p : _ t = { ~err } +let lookahead_ignore p : _ t = { + run=fun st ~ok ~err -> + p.run st + ~ok:(fun _st _x -> ok st ()) + ~err +} + (*$= & ~printer:(errpp Q.Print.(string)) (Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd") *) -let line : _ t = { - run=fun st ~ok ~err -> - if is_done st then err (mk_error_ st (const_str_ "expected a line, not EOI")) - else ( - match String.index_from st.str st.i '\n' with +(*$= + (Ok "1234") (parse_string line_str "1234\nyolo") + (Ok ("1234", "yolo")) (parse_string (line_str ||| line_str) "1234\nyolo\nswag") +*) + +let split_1 ~on_char : _ t = { + run=fun st ~ok ~err:_ -> + if st.i >= st.j then ( + ok st (st, None) + ) else ( + match String.index_from st.str st.i on_char with | j -> - let s = String.sub st.str st.i (j - st.i) in - ok {st with i=j+1} s + let x = {st with j} in + let y = {st with i=min st.j (j+1)} in + let st_done = {st with i=st.j} in (* empty *) + ok st_done (x, Some y) | exception Not_found -> - err (mk_error_ st (const_str_ "unterminated line")) + let st_done = {st with i=st.j} in (* empty *) + ok st_done (st, None) ) } -(*$= - (Ok "1234") (parse_string line "1234\nyolo") - (Ok ("1234", "yolo")) (parse_string (line ||| line) "1234\nyolo\nswag") +let split_list_at_most ~on_char n : slice list t = + let rec loop acc n = + if n <= 0 then return (List.rev acc) + else ( + try_or + eoi + ~f:(fun _ -> return (List.rev acc)) + ~else_:(parse_1 acc n) + ) + and parse_1 acc n = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc (n-1)) + in + loop [] n + +(*$= & ~printer:(errpp Q.Print.(list string)) ~cmp:(erreq (=)) + (Ok ["a";"b";"c";"d,e,f"]) \ + (parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,b,c,d,e,f") + (Ok ["a";"bc"]) \ + (parse_string (split_list_at_most ~on_char:',' 3 >|= List.map Slice.to_string) "a,bc") *) -(* parse a string [s] using [p_sub], then parse [s] using [p]. - The result is that of parsing [s] using [p], but the state is - the one after using [p_sub], and errors are translated back into the context - of [p_sub]. - This can be useful for example in [p_sub line some_line_parser]. *) -let parse_sub_ p_sub p : _ t = { - run=fun st0 ~ok ~err -> - let p = p <* eoi in (* make sure [p] reads all *) - p_sub.run st0 - ~ok:(fun st1 s -> - p.run (state_of_string s) - ~ok:(fun _ r -> ok st1 r) - ~err:(fun e -> - let pos = e.pos in - let pos = { - pos_buffer=pos.pos_buffer; - pos_offset=pos.pos_offset + st0.i; - pos_lc=None; - } in - err {e with pos})) - ~err -} +let split_list ~on_char : _ t = + split_list_at_most ~on_char max_int + +let split_2 ~on_char : _ t = + split_list_at_most ~on_char 2 >>= function + | [a; b] -> return (a,b) + | _ -> fail "split_2: expected 2 fields exactly" + +let split_3 ~on_char : _ t = + split_list_at_most ~on_char 3 >>= function + | [a; b; c] -> return (a,b,c) + | _ -> fail "split_3: expected 3 fields exactly" + +let split_4 ~on_char : _ t = + split_list_at_most ~on_char 4 >>= function + | [a; b; c; d] -> return (a,b,c,d) + | _ -> fail "split_4: expected 4 fields exactly" + +let split_list ~on_char : slice list t = + let rec loop acc = + try_or + eoi + ~f:(fun _ -> return (List.rev acc)) + ~else_:(parse_1 acc) + and parse_1 acc = + split_1 ~on_char >>= fun (sl1, rest) -> + let acc = sl1 :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let each_split ~on_char p : 'a list t = + let rec loop acc = + split_1 ~on_char >>= fun (sl1, rest) -> + (* parse [sl1] with [p] *) + recurse sl1 p >>= fun x -> + let acc = x :: acc in + match rest with + | None -> return (List.rev acc) + | Some rest -> recurse rest (loop acc) + in + loop [] + +let line : slice t = + split_1 ~on_char:'\n' >|= fst + +let line_str = line >|= Slice.to_string let each_line p : _ t = - fix - (fun self -> - try_or eoi - ~f:(fun _ -> pure []) - (parse_sub_ line p >>= fun x -> - self >|= fun tl -> x :: tl)) + each_split ~on_char:'\n' p (*$= & ~printer:(errpp Q.Print.(list @@ list int)) (Ok ([[1;1];[2;2];[3;3]])) \ diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index e20b4914..c3a035c6 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -8,6 +8,8 @@ {2 A few examples} + Some more advanced example(s) can be found in the [/examples] directory. + {4 Parse a tree} {[ @@ -104,7 +106,7 @@ module Error : sig (** Pretty prints the error *) end -type 'a or_error = ('a, Error.t) result +type +'a or_error = ('a, Error.t) result (** ['a or_error] is either [Ok x] for some result [x : 'a], or an error {!Error.t}. @@ -146,21 +148,45 @@ val ap : ('a -> 'b) t -> 'a t -> 'b t (** Applicative. @since NEXT_RELEASE *) +val eoi : unit t +(** Expect the end of input, fails otherwise. *) + +val nop : unit t +(** Succeed with [()]. *) + +val empty : unit t +(** Succeed with [()], same as {!nop}. + @since NEXT_RELEASE *) + val fail : string -> 'a t (** [fail msg] fails with the given message. It can trigger a backtrack. *) val failf: ('a, unit, string, 'b t) format4 -> 'a (** [Format.sprintf] version of {!fail}. *) +val fail_lazy : (unit -> string) -> 'a t +(** Like {!fail}, but only produce an error message on demand. + @since NEXT_RELEASE *) + val parsing : string -> 'a t -> 'a t (** [parsing s p] behaves the same as [p], with the information that - we are parsing [s], if [p] fails. *) + we are parsing [s], if [p] fails. + The message [s] is added to the error, it does not replace it, + not does the location change (the error still points to + the same location as in [p]). *) -val eoi : unit t -(** Expect the end of input, fails otherwise. *) +val set_error_message : string -> 'a t -> 'a t +(** [set_error_message msg p] behaves like [p], but if [p] fails, + [set_error_message msg p] fails with [msg] instead and at the current + position. The internal error message of [p] is just discarded. + @since NEXT_RELEASE *) -val nop : unit t -(** Succeed with [()]. *) +val with_pos : 'a t -> ('a * position) t +(** [with_pos p] behaves like [p], but returns the (starting) position + along with [p]'s result. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val any_char : char t (** [any_char] parses any character. @@ -169,27 +195,52 @@ val any_char : char t val any_chars : int -> string t (** [any_chars len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. @since NEXT_RELEASE *) val char : char -> char t (** [char c] parses the character [c] and nothing else. *) -val char_if : ?descr:string -> (char -> bool) -> char t -(** [char_if f] parses a character [c] if [f c = true]. - @param descr describes what kind of character was expected *) +type slice +(** A slice of the input, as returned by some combinators such + as {!split_1} or {split_n}. -val chars_if : (char -> bool) -> string t -(** [chars_if f] parses a string of chars that satisfy [f]. *) + {b EXPERIMENTAL} + @since NEXT_RELEASE *) -val chars1_if : ?descr:string -> (char -> bool) -> string t -(** Like {!chars_if}, but only non-empty strings. - @param descr describes what kind of character was expected *) +(** Functions on slices. + @since NEXT_RELEASE *) +module Slice : sig + type t = slice + + val is_empty : t -> bool + (** Is the slice empty? *) + + val length : t -> int + (** Length of the slice *) + + val to_string : t -> string + (** Convert the slice into a string. + Linear time and memory in [length slice] *) +end + +val recurse : slice -> 'a t -> 'a t +(** [recurse slice p] parses the [slice] + (most likely obtained via another combinator, such as {!split_1} + or {!split_n}), using [p]. + + The slice contains a position which is used to relocate error + messages to their position in the whole input, not just relative to + the slice. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) val chars_fold : f:('acc -> char -> [`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) -> 'acc -> - 'acc t + ('acc * slice) t (** [chars_fold f acc0] folds over characters of the input. Each char [c] is passed, along with the current accumulator, to [f]; [f] can either: @@ -204,24 +255,54 @@ val chars_fold : This is a generalization of of {!chars_if} that allows one to transform characters on the fly, skip some, handle escape sequences, etc. + It can also be useful as a base component for a lexer. @since NEXT_RELEASE *) -val chars_fold_map : +val chars_fold_transduce : f:('acc -> char -> - [`Continue of 'acc | `Yield of 'acc * char + [ `Continue of 'acc | `Yield of 'acc * char | `Consume_and_stop | `Stop | `Fail of string]) -> 'acc -> ('acc * string) t (** Same as {!char_fold} but with the following differences: - - returns a string along with the accumulator. The string is built from - characters returned by [`Yield]. + - returns a string along with the accumulator, rather than the slice + of all the characters accepted by [`Continue _]. + The string is built from characters returned by [`Yield]. - new case [`Yield (acc, c)] adds [c] to the returned string and continues parsing with [acc]. @since NEXT_RELEASE *) +val take : int -> slice t +(** [slice_of_len len] parses exactly [len] characters from the input. + Fails if the input doesn't contain at least [len] chars. + @since NEXT_RELEASE *) + +val take_if : (char -> bool) -> slice t +(** [take_if f] takes characters as long as they satisfy the predicate [f]. + @since NEXT_RELEASE *) + +val take1_if : ?descr:string -> (char -> bool) -> slice t +(** [take1_if f] takes characters as long as they satisfy the predicate [f]. + Fails if no character satisfies [f]. + @since NEXT_RELEASE *) + +val char_if : ?descr:string -> (char -> bool) -> char t +(** [char_if f] parses a character [c] if [f c = true]. + Fails if the next char does not satisfy [f]. + @param descr describes what kind of character was expected *) + +val chars_if : (char -> bool) -> string t +(** [chars_if f] parses a string of chars that satisfy [f]. + Cannot fail. *) + +val chars1_if : ?descr:string -> (char -> bool) -> string t +(** Like {!chars_if}, but only non-empty strings. + Fails if the string is empty. + @param descr describes what kind of character was expected *) + val endline : char t (** Parse '\n'. *) @@ -272,7 +353,9 @@ val many : 'a t -> 'a list t val optional : _ t -> unit t (** [optional p] tries to parse [p], and return [()] whether it - succeeded or failed. Cannot fail. + succeeded or failed. Cannot fail itself. + It consumes input if [p] succeeded (as much as [p] consumed), but + consumes not input if [p] failed. @since NEXT_RELEASE *) val try_ : 'a t -> 'a t @@ -307,6 +390,27 @@ val try_or : 'a t -> f:('a -> 'b t) -> else_:'b t -> 'b t @since NEXT_RELEASE *) +val try_or_l : + ?msg:string -> + ?else_:'a t -> + (unit t * 'a t) list -> + 'a t +(** [try_or_l ?else_ l] tries each pair [(test, p)] in order. + If the n-th [test] succeeds, then [try_or_l l] behaves like n-th [p], + whether [p] fails or not. + If they all fail, and [else_] is defined, then it behaves like [else_]. + If all fail, and [else_] is [None], then it fails as well. + + This is a performance optimization compared to {!(<|>)}. We commit to a + branch if the test succeeds, without backtracking at all. + + See {!lookahead_ignore} for a convenient way of writing the test conditions. + + @param msg error message if all options fail + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + val or_ : 'a t -> 'a t -> 'a t (** [or_ p1 p2] tries to parse [p1], and if it fails, tries [p2] from the same position. @@ -316,12 +420,6 @@ val both : 'a t -> 'b t -> ('a * 'b) t (** [both a b] parses [a], then [b], then returns the pair of their results. @since NEXT_RELEASE *) -val set_error_message : string -> 'a t -> 'a t -(** [set_error_message msg p] behaves like [p], but if [p] fails, - [set_error_message msg p] fails with [msg] instead. - @since NEXT_RELEASE -*) - val many1 : 'a t -> 'a list t (** [many1 p] is like [many p] excepts it fails if the list is empty (i.e. it needs [p] to succeed at least once). *) @@ -345,8 +443,26 @@ val lookahead : 'a t -> 'a t {b EXPERIMENTAL} @since NEXT_RELEASE *) -val line : string t -(** Parse a line, '\n' excluded. +val lookahead_ignore : 'a t -> unit t +(** [lookahead_ignore p] tries to parse input with [p], + and succeeds if [p] succeeds. However it doesn't consume any input + and returns [()], so in effect its only use-case is to detect + whether [p] succeeds, e.g. in {!cond}. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val fix : ('a t -> 'a t) -> 'a t +(** Fixpoint combinator. *) + +val line : slice t +(** Parse a line, ['\n'] excluded, and position the cursor after the ['\n']. + @since NEXT_RELEASE *) + +val line_str : string t +(** [line_str] is [line >|= Slice.to_string]. + It parses the next line and turns the slice into a string. + The state points to after the ['\n'] character. @since NEXT_RELEASE *) val each_line : 'a t -> 'a list t @@ -354,8 +470,90 @@ val each_line : 'a t -> 'a list t {b EXPERIMENTAL} @since NEXT_RELEASE *) -val fix : ('a t -> 'a t) -> 'a t -(** Fixpoint combinator. *) +val split_1 : on_char:char -> (slice * slice option) t +(** [split_1 ~on_char] looks for [on_char] in the input, and returns a + pair [sl1, sl2], where: + + - [sl1] is the slice of the input the precedes the first occurrence + of [on_char], or the whole input if [on_char] cannot be found. + - [sl2] is the slice that comes after [on_char], + or [None] if [on_char] couldn't be found. + + The parser is now positioned at the end of the input. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_list : on_char:char -> slice list t +(** [split_n ~on_char] splits the input on all occurrences of [on_char], + returning a list of slices. + + A useful specialization of this is {!each_line}, which is + basically [split_n ~on_char:'\n' p]. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_list_at_most : on_char:char -> int -> slice list t +(** [split_list_at_most ~on_char n] applies [split_1 ~on_char] at most + [n] times, to get a list of [n+1] elements. + The last element might contain [on_char]. This is useful to limit the + amount of work done by {!split_list}. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + + +val split_2 : on_char:char -> (slice * slice) t +(** [split_2 ~on_char] splits the input into exactly 2 fields, + and fails if the split yields less or more than 2 items. + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_3 : on_char:char -> (slice * slice * slice) t +(** See {!split_2} + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val split_4 : on_char:char -> (slice * slice * slice * slice) t +(** See {!split_2} + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val each_split : on_char:char -> 'a t -> 'a list t +(** [split_list_map ~on_char p] uses [split_list ~on_char] to split + the input, then parses each chunk of the input thus obtained using [p]. + + The difference with [sep ~by:(char on_char) p] is that + [sep] calls [p] first, and only tries to find [on_char] after [p] returns. + While it is more flexible, this technique also means [p] has to be careful + not to consume [on_char] by error. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +val all : slice t +(** [all] returns all the unconsumed input as a slice, and consumes it. + Use {!Slice.to_string} to turn it into a string. + + Note that [lookahead all] can be used to {i peek} at the rest of the input + without consuming anything. + + @since NEXT_RELEASE *) + +val all_str : string t +(** [all_str] accepts all the remaining chars and extracts them into a + string. Similar to {!rest_of_input} but with a string. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + +(* TODO +val trim : slice t +(** [trim] is like {!all}, but removes whitespace on the left and right. + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + *) val memo : 'a t -> 'a t (** Memoize the parser. [memo p] will behave like [p], but when called From 352fc10d3b64bd2158a3c32ab76ca378a475b990 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 22:49:43 -0400 Subject: [PATCH 13/24] more doc for CCParse --- src/core/CCParse.ml | 13 +++++++++++++ src/core/CCParse.mli | 14 +++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 2fbdf451..175ef525 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -190,6 +190,19 @@ type state = { j: int; (* end pointer in [str], excluded. [len = j-i] *) memo : Memo_state.t option ref; (* Memoization table, if any *) } +(* FIXME: replace memo with: + [global : global_st ref] + + where: + [type global = { + mutable memo: Memo_state.t option; + line_offsets: int CCVector.vector; + } + + with line_offsets used to cache the offset where each line begins, + and is computed lazily, to make {!Position.line_and_column} + faster if called many times. + *) let[@inline] char_equal (a : char) b = Stdlib.(=) a b let string_equal = String.equal diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index c3a035c6..2edf94b5 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -203,7 +203,19 @@ val char : char -> char t type slice (** A slice of the input, as returned by some combinators such - as {!split_1} or {split_n}. + as {!split_1} or {split_list} or {!take}. + + The idea is that one can use some parsers to cut the input into slices, + e.g. split into lines, or split a line into fields (think CSV or TSV). + Then a variety of parsers can be used on each slice to extract data from + it using {!recurse}. + + Slices contain enough information to make it possible + for [recurse slice p] to report failures (if [p] fails) using locations + from the original input, not relative to the slice. + Therefore, even after splitting the input into lines using, say, {!each_line}, + a failure to parse the 500th line will be reported at line 500 and + not at line 1. {b EXPERIMENTAL} @since NEXT_RELEASE *) From d46a679b3bc78f6b18140431554efc7f0d4758e3 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 23:00:51 -0400 Subject: [PATCH 14/24] fix bug in CCParse.line, add `set_current_slice`, fix tests --- src/core/CCParse.ml | 36 +++++++++++++++++++++++++++--------- src/core/CCParse.mli | 6 ++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 175ef525..d83b0f35 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -398,7 +398,8 @@ let all_str = all >|= Slice.to_string (*$= & ~printer:(errpp Q.Print.string) ~cmp:(erreq (=)) (Ok "abcd") (parse_string all_str "abcd") - (Ok "cd") (parse_string (string "ab" *> all_str) "ab") + (Ok "cd") (parse_string (string "ab" *> all_str) "abcd") + (Ok "") (parse_string (string "ab" *> all_str) "ab") *) (*$= & ~printer:(errpp Q.Print.(pair string string)) ~cmp:(erreq (=)) @@ -804,12 +805,21 @@ let lookahead_ignore p : _ t = { ~err } +let set_current_slice sl : _ t = { + run=fun _st ~ok ~err:_ -> + assert CCShims_.Stdlib.(_st.memo == sl.memo); + ok sl () (* jump to slice *) +} + (*$= & ~printer:(errpp Q.Print.(string)) (Ok "abc") (parse_string (lookahead (string "ab") *> (string "abc")) "abcd") *) -(*$= +(*$= & ~printer:(errpp Q.Print.(string)) (Ok "1234") (parse_string line_str "1234\nyolo") + *) + +(*$= & ~printer:(errpp Q.Print.(pair String.escaped String.escaped)) (Ok ("1234", "yolo")) (parse_string (line_str ||| line_str) "1234\nyolo\nswag") *) @@ -832,8 +842,12 @@ let split_1 ~on_char : _ t = { let split_list_at_most ~on_char n : slice list t = let rec loop acc n = - if n <= 0 then return (List.rev acc) - else ( + if n <= 0 then ( + (* add the rest to [acc] *) + all >|= fun rest -> + let acc = rest :: acc in + List.rev acc + ) else ( try_or eoi ~f:(fun _ -> return (List.rev acc)) @@ -859,17 +873,17 @@ let split_list ~on_char : _ t = split_list_at_most ~on_char max_int let split_2 ~on_char : _ t = - split_list_at_most ~on_char 2 >>= function + split_list_at_most ~on_char 3 >>= function | [a; b] -> return (a,b) | _ -> fail "split_2: expected 2 fields exactly" let split_3 ~on_char : _ t = - split_list_at_most ~on_char 3 >>= function + split_list_at_most ~on_char 4 >>= function | [a; b; c] -> return (a,b,c) | _ -> fail "split_3: expected 3 fields exactly" let split_4 ~on_char : _ t = - split_list_at_most ~on_char 4 >>= function + split_list_at_most ~on_char 5 >>= function | [a; b; c; d] -> return (a,b,c,d) | _ -> fail "split_4: expected 4 fields exactly" @@ -901,7 +915,11 @@ let each_split ~on_char p : 'a list t = loop [] let line : slice t = - split_1 ~on_char:'\n' >|= fst + split_1 ~on_char:'\n' >>= fun (sl, rest) -> + match rest with + | None -> return sl + | Some rest -> + set_current_slice rest >|= fun () -> sl let line_str = line >|= Slice.to_string @@ -909,7 +927,7 @@ let each_line p : _ t = each_split ~on_char:'\n' p (*$= & ~printer:(errpp Q.Print.(list @@ list int)) - (Ok ([[1;1];[2;2];[3;3]])) \ + (Ok ([[1;1];[2;2];[3;3];[]])) \ (parse_string (each_line (sep ~by:skip_space U.int)) "1 1\n2 2\n3 3\n") *) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 2edf94b5..1238a924 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -248,6 +248,12 @@ val recurse : slice -> 'a t -> 'a t {b EXPERIMENTAL} @since NEXT_RELEASE *) +val set_current_slice : slice -> unit t +(** [set_current_slice slice] replaces the parser's state with [slice]. + + {b EXPERIMENTAL} + @since NEXT_RELEASE *) + val chars_fold : f:('acc -> char -> [`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) -> From 9c763991ef1de7dc759d9dfe9ec1a31b309fc439 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 23:01:50 -0400 Subject: [PATCH 15/24] add example and test of an IRC log parser also add a sample of IRC logs of #ocaml on libera.chat, to make sure they parse properly. --- examples/ccparse_irclogs.ml | 78 ++++ examples/ccparse_sexp.ml | 2 +- examples/dune | 9 +- examples/test_data/irc-logs.txt | 777 ++++++++++++++++++++++++++++++++ 4 files changed, 864 insertions(+), 2 deletions(-) create mode 100644 examples/ccparse_irclogs.ml create mode 100644 examples/test_data/irc-logs.txt diff --git a/examples/ccparse_irclogs.ml b/examples/ccparse_irclogs.ml new file mode 100644 index 00000000..5fec08d8 --- /dev/null +++ b/examples/ccparse_irclogs.ml @@ -0,0 +1,78 @@ + +(* parse IRC logs *) + +type datetime = { + year: int; + month: int; + day: int; + hour: int; + min: int; + sec: int; +} + +let pp_datetime out d = + let {year;month;day;hour;min;sec} = d in + CCFormat.(fprintf out "{y=%d;M=%d;d=%d;h=%d;m=%d;s=%d}" + year month day hour min sec) + +type msg = { + timestamp: datetime; + user: string; + msg: string; +} + +let pp_msg out m = + CCFormat.fprintf out "{@[time=%a;@ user=%S;@ msg=%S@]}" + pp_datetime m.timestamp m.user m.msg + +open CCParse + +let p_datetime : datetime t = + let int = U.int in + let* date, time = split_2 ~on_char:' ' in + let* y, m, d = recurse date (split_3 ~on_char:'-') in + let* year = recurse y int in + let* month = recurse m int in + let* day = recurse d int in + let* hour, min, sec = + recurse time + (let* hour = int in + char ':' *> + let* min = int in + char ':' *> + let+ sec = int in + hour,min,sec) + in + let dt = {year;month;day;hour;min;sec} in + return dt + +let p_line = + let* line = lookahead all in + + if Slice.is_empty line then return None + else ( + let* fields = split_list ~on_char:'\t' in + match fields with + | [date; user; rest] -> + let+ timestamp = recurse date p_datetime + and+ user = recurse user (chars_if (function '>' -> false | _ -> true)) + and+ msg = recurse rest (all_str >|= String.trim) in + Some {timestamp; user; msg} + + | _ -> + failf "expected 3 fields, got [%s]" + (String.concat ";" @@ List.map String.escaped @@ List.map Slice.to_string fields) + ) + +let p_file = + each_line (parsing "line" p_line) >|= + CCList.keep_some + +let () = + let s = CCIO.File.read_exn Sys.argv.(1) in + match parse_string p_file s with + | Ok l -> + Format.printf "parsed:@."; + List.iter (Format.printf "%a@." pp_msg) l + | Error e -> + Format.printf "parse error: %s@." e; exit 1 diff --git a/examples/ccparse_sexp.ml b/examples/ccparse_sexp.ml index 369542ab..fc5590a9 100644 --- a/examples/ccparse_sexp.ml +++ b/examples/ccparse_sexp.ml @@ -20,7 +20,7 @@ let skip_white_and_comments = ) let atom = - chars_fold_map `Start + chars_fold_transduce `Start ~f:(fun acc c -> match acc, c with | `Start, '"' -> `Continue `In_quote diff --git a/examples/dune b/examples/dune index 554c2165..e2a6a711 100644 --- a/examples/dune +++ b/examples/dune @@ -1,6 +1,6 @@ (executables - (names id_sexp ccparse_sexp) + (names id_sexp ccparse_sexp ccparse_irclogs) (libraries containers) ;(modules id_sexp) (flags :standard -warn-error -a+8 -safe-string -color always) @@ -20,3 +20,10 @@ (action (ignore-stdout (run ./ccparse_sexp.exe test_data/benchpress.sexp)))) + +(alias + (name runtest) + (locks /ctest) + (action + (ignore-stdout + (run ./ccparse_irclogs.exe test_data/irc-logs.txt)))) diff --git a/examples/test_data/irc-logs.txt b/examples/test_data/irc-logs.txt new file mode 100644 index 00000000..506ed1aa --- /dev/null +++ b/examples/test_data/irc-logs.txt @@ -0,0 +1,777 @@ +2021-06-04 00:50:44 kluk> How do I start using DynArray from the ocaml command line? +2021-06-04 00:50:51 kluk> I have already done opam install extlib +2021-06-04 00:51:12 kluk> I am a newbie at OCaml +2021-06-04 05:18:03 dockerusocamlus> Hello! I'm minimizing an Alpine-based Docker image with OCaml installed via opam, and I'm trying to understand if I could erase some files to save some space. Basically, trying to understand if they are needed only on special situations, or if that would cause issues for users of the Docker image. +2021-06-04 05:19:46 dockerusocamlus> For instance, in this image, I have file ~/.opam//lib/ocaml/expunge, which take 15 MB of space. I don't think I have ever used it, but I don't know if it's internally used by some other OCaml process. +2021-06-04 05:28:12 dockerusocamlus> I don't have much documentation about it, and grepping ocaml's sources only shows a few occurrences. It seems related to the installation of the OCaml compiler itself, but even after removing it, I'm still able to do a `opam switch create` to install a different compiler, so... I guess it's fine to remove it? +2021-06-04 05:36:13 octachron> This is a compiler tool which is used to build REPLs. It is also used by utop. +2021-06-04 05:42:54 dockerusocamlus> Thanks! +2021-06-04 08:10:44 superherointj> Need some feedback on a minimalistic lwt demo: https://github.com/superherointj/lwt-demo1 +2021-06-04 08:38:37 d_bot> Just solved it. I must be really tired. +2021-06-04 09:49:45 d_bot> Can anybody point me to a good article/information on incompatible ppx drivers (ppxlib and ocaml-migrate-parsetree)? +2021-06-04 09:49:46 d_bot> I have read already the saga blog post, but I am missing something. +2021-06-04 09:49:47 d_bot> I want to build my old project. I'm trying to replicate problem atm on a demo. +2021-06-04 09:50:25 companion_cube> people are supposed to use ppxlib, that's all I know +2021-06-04 09:51:25 d_bot> Any example? +2021-06-04 09:51:51 companion_cube> https://github.com/ocaml-ppx/ppx_deriving I guess? +2021-06-04 09:52:40 d_bot> Found this: +2021-06-04 09:52:41 d_bot> https://ppxlib.readthedocs.io/_/downloads/en/stable/pdf/ +2021-06-04 09:57:49 d_bot> Why does OCaml not optimizes this in a noop? Even under flambda and -O3 +2021-06-04 09:57:49 d_bot> +2021-06-04 09:57:51 d_bot> ```ocaml +2021-06-04 09:57:52 d_bot> let f (a, b) = (a, b) +2021-06-04 09:57:53 d_bot> ``` +2021-06-04 10:00:07 @adrien> it returns a new tuple, not the same one +2021-06-04 10:00:37 @adrien> let x = (1,2);; let f (a, b) = (a, b);; let y = f x;; y == x;; +2021-06-04 10:00:41 d_bot> the question is why? It would change the `==` behavior but it's already not defined from what I remember +2021-06-04 10:01:06 d_bot> it behaves differently in bytecode, native and IIRC it's also different in flambda +2021-06-04 10:01:14 companion_cube> I agree it'd be a valid optim +2021-06-04 10:02:19 d_bot> This is especiall try for the case of different types and pattern matching but that generates identical data in memory, like +2021-06-04 10:02:20 d_bot> +2021-06-04 10:02:21 d_bot> ```ocaml +2021-06-04 10:02:22 d_bot> type a = | A(int) +2021-06-04 10:02:24 d_bot> type b = B(int) +2021-06-04 10:02:25 d_bot> let f = function | A v -> B v +2021-06-04 10:02:26 d_bot> ``` +2021-06-04 10:02:36 @adrien> I get the same behaviour in native +2021-06-04 10:03:11 @adrien> and you can do f u = u +2021-06-04 10:03:13 companion_cube> @eduardors these are only the same by accident though +2021-06-04 10:03:18 companion_cube> seems far less useful as an optim +2021-06-04 10:03:18 zozozo> see https://github.com/ocaml/ocaml/pull/8958 +2021-06-04 10:03:22 d_bot> yes but the compiler knows it +2021-06-04 10:03:32 @adrien> not sure how is the generated code but in that case it's not a new tuple +2021-06-04 10:04:02 zozozo> there is a PR to do pretty much that (the link I posted above) +2021-06-04 10:04:05 theblatte> I keep writing functions like `let f ((a,b) as x0) = let a' = g a in let b' = g b in if a == a' && b == b' then x0 else (a', b')` +2021-06-04 10:04:07 d_bot> in this case yes, but not all cases, I'm not asking about this specific tuple, I'm asking more about identical blocks that are known to be always identical +2021-06-04 10:04:13 companion_cube> I don't think it's a very useful optimization to see if per chance two different variants of different types happen to have the same binary representation +2021-06-04 10:04:33 companion_cube> more important stuff is to eliminate temporaries imho +2021-06-04 10:04:41 companion_cube> like a tuple built just to be deconstructed in the same function +2021-06-04 10:04:45 companion_cube> (or an option…) +2021-06-04 10:04:53 zozozo> companion_cube: what do you mean by "temporaries"? +2021-06-04 10:05:05 d_bot> temporary allocations IIUC +2021-06-04 10:05:06 companion_cube> data that doesn't escape the current function :p +2021-06-04 10:05:09 zozozo> companion_cube: ah, well, avoiding these is more or less exactly the job of flambda, ^^ +2021-06-04 10:05:11 companion_cube> (after inlining) +2021-06-04 10:05:12 companion_cube> yeah I know +2021-06-04 10:05:17 companion_cube> godspeed to you zozozo +2021-06-04 10:05:30 zozozo> ^^ +2021-06-04 10:05:55 zozozo> @EduardoRFS : did you look at https://github.com/ocaml/ocaml/pull/8958 ? +2021-06-04 10:07:07 d_bot> I'm looking on it, the argument of not being predictable is sad, it's a flat allocation reduction, no hidden allocation, not trying to make non efficient code efficient, but trying to make code that is efficient as possible more efficient +2021-06-04 10:07:34 zozozo> companion_cube: also, note that sometimes, because of type subtleties, you need to write the "identity" function, as a pattern match that then reconstructs exactly the same value, but with a slightly different type (thing GADTs), in such cases, being able to detect that a switch returns exactly its argument, is a nice improvements, and you can't really write it differently because of the types +2021-06-04 10:07:36 @adrien> well, as theblatte said, the "as" construct should help for that case +2021-06-04 10:08:04 d_bot> that's exactly the case zozozo, for a lot of code in ocaml-migrate-types +2021-06-04 10:08:11 theblatte> zozozo: yes! +2021-06-04 10:08:19 zozozo> the advantage of the PR I linked is that it can trigger in cases where one cannot write code using "as" +2021-06-04 10:08:27 theblatte> much sad when that happens +2021-06-04 10:08:30 d_bot> "as"? +2021-06-04 10:08:53 zozozo> @EduardoRFS : `let f ((a, b) as pair) = pair` +2021-06-04 10:09:13 d_bot> oh but that works only for structural types +2021-06-04 10:09:21 companion_cube> zozozo: good argument against GADTs ;) +2021-06-04 10:09:34 d_bot> companion_cube loves GADTs +2021-06-04 10:09:42 companion_cube> heh, in small doses +2021-06-04 10:09:47 theblatte> companion_cube: no need for GADTs! https://github.com/facebook/infer/blob/cfed4c4fa0c99ab1f42683bb92df76c8c8434e79/infer/src/pulse/PulseSummary.ml#L56 +2021-06-04 10:10:03 olle> as? +2021-06-04 10:10:06 olle> new keyword? +2021-06-04 10:10:13 companion_cube> wait, theblatte, why +2021-06-04 10:10:14 theblatte> eg phantom type parameters +2021-06-04 10:10:18 companion_cube> ah yes +2021-06-04 10:10:29 companion_cube> it's unfortunate +2021-06-04 10:10:34 theblatte> (in my case not phantom but "phantom" because it doesn't show up in some of the variants) +2021-06-04 10:10:37 companion_cube> but it's the same constructors in this case. +2021-06-04 10:10:38 zozozo> companion_cube: gadts are useful *sometimes* +2021-06-04 10:10:40 d_bot> I wonder if #8958 would be better as lambda layer +2021-06-04 10:10:44 d_bot> but tempting to rebase it ;/ +2021-06-04 10:11:25 zozozo> @EduardoRFS : the problem is that if you do that at lambda level, you miss out on situations where it happens after some inlining/simplification +2021-06-04 10:11:47 d_bot> yeah but you ensure same behavior between all backends +2021-06-04 10:11:50 zozozo> (also, the code of lambda simplifications is quite a mess from what I hear) +2021-06-04 10:12:33 theblatte> companion_cube: same constructors: yes, personally I would only care about preserving physical equality when the objects are actually equal but ymmv +2021-06-04 10:12:34 zozozo> well.. there is now a pass specifically designed to implement optimizations, so why not use it ? +2021-06-04 10:13:05 theblatte> I've seen several examples where it would have a material effect on perf +2021-06-04 10:13:08 d_bot> But the pass should not change behavior of code unless it provides a fallback, this is how I see most of it +2021-06-04 10:13:13 d_bot> maybe Sys.opaque_identity would ignore it +2021-06-04 10:13:32 d_bot> can we deprecate ==? That seems like a better idea overall +2021-06-04 10:13:34 companion_cube> zozozo: because it only works for native? +2021-06-04 10:13:41 companion_cube> ahahah +2021-06-04 10:13:47 companion_cube> removing == kills perf for other programs +2021-06-04 10:14:01 theblatte> #8958 ftw, I didn't know there'd been such a PR in flight for such a long time +2021-06-04 10:14:04 zozozo> companion_cube: well, bytecode is pretty much meant to not care about performance, so from that point of view it's not unreasonable +2021-06-04 10:14:05 d_bot> not removing it, deprecating it, keep it under Obj.xx +2021-06-04 10:14:34 theblatte> == is an important part of the language, not an extension +2021-06-04 10:14:41 zozozo> the *good* solution would be to change the bytecode generation to use the result of flambda +2021-06-04 10:14:56 zozozo> the semantics of "==" is largely not officially specified +2021-06-04 10:14:56 d_bot> NAH +2021-06-04 10:15:10 theblatte> but not a bad idea to not give it such an easily-confused name :p eg use "phys_equal" instead +2021-06-04 10:15:12 zozozo> and for any non-mutable record, there are next to no guarantees about "==" +2021-06-04 10:15:26 d_bot> unless we had a blazing fast flambda pass, bytecode is so fast right now +2021-06-04 10:16:22 d_bot> == is not exactly part of the language in many ways, and it's known to behave differently depending on the backend which should never happen for a specified feature of the language +2021-06-04 10:16:30 zozozo> @EduardoRFS: are you talking about compilation time or runtime of the compild program ? +2021-06-04 10:16:35 d_bot> compilation time +2021-06-04 10:16:36 companion_cube> zozozo: I wish I could agree +2021-06-04 10:16:40 companion_cube> but some of us are stuck with bytecode +2021-06-04 10:16:45 d_bot> bytecode is slow in runtime, really slow +2021-06-04 10:16:46 companion_cube> because that's the only official toplevel for now +2021-06-04 10:17:10 d_bot> but bytecode generated from flambda would still work with the toplevel +2021-06-04 10:17:16 zozozo> companion_cube: yeah, but sometimes with others in my team, we talk about making it so that bytecode is generated after the flambda pass, which would solve all problems (if we can make it work) +2021-06-04 10:17:21 companion_cube> sure +2021-06-04 10:17:36 companion_cube> I mean in the future maybe we'll also have a JIT +2021-06-04 10:17:42 d_bot> there is any plan on deprecating closure middle end? +2021-06-04 10:17:45 companion_cube> but for now it's not like there's a choice, and there's basically 0 optims on bytecode +2021-06-04 10:17:47 companion_cube> which… ugh +2021-06-04 10:19:26 d_bot> it remmembers me one time when people compared ocsigenserver and http servers and used the bytecode version accidentally and say, OCaml is so bad +2021-06-04 10:19:34 companion_cube> :D +2021-06-04 10:19:38 d_bot> D: +2021-06-04 10:19:49 companion_cube> or even using dune without --profile=release +2021-06-04 10:19:53 companion_cube> bye bye optims +2021-06-04 10:19:58 d_bot> TEZOS IS RUNNING WITHOUT PROFILE=RELEASE +2021-06-04 10:20:25 d_bot> even worse it is benchmarked without profile=release +2021-06-04 10:20:33 companion_cube> hu, weirder +2021-06-04 10:21:18 zozozo> well, if the switch is not using flambda, I don't think the difference is that important between the dev and release profiles +2021-06-04 10:22:34 companion_cube> err, you still have a bit of cross module inlining, don't you? +2021-06-04 10:22:39 companion_cube> with normal ocamlopt +2021-06-04 10:22:54 zozozo> I'm not sure +2021-06-04 10:22:55 d_bot> yeah it makes difference, I benchmarked it, around 30% boost on some smart contracts +2021-06-04 10:23:06 d_bot> dune without profile=release runs under -opaque +2021-06-04 10:23:10 companion_cube> I think it does, including for stuff like externals +2021-06-04 10:23:16 companion_cube> exactly +2021-06-04 10:23:25 companion_cube> --profile=release brings you back to normal behavior +2021-06-04 10:23:26 zozozo> I think (but I'm not sure) the only thing cross-inlined would be externals, but those are in the .mlis so no need for cross-optimization actually +2021-06-04 10:23:30 d_bot> externals rely on the interface, so it doesn't depend on profile=release +2021-06-04 10:23:50 companion_cube> zozozo: but the .cmx ? +2021-06-04 10:24:00 theblatte> is profile=release different than passing -O3 to ocamlopt?? +2021-06-04 10:24:05 zozozo> ah, maybe the small functions that closure unconditionally inline are inliend cross-modules by vanilla ocamlopt +2021-06-04 10:24:17 d_bot> it is, because without profile=release you're under -opaque +2021-06-04 10:24:30 theblatte> whaaaat +2021-06-04 10:24:40 theblatte> :o +2021-06-04 10:24:44 d_bot> that's the only way to achieve blazing fast build speed +2021-06-04 10:24:53 companion_cube> zozozo: the functions marked "inline" in .cmx files +2021-06-04 10:24:56 d_bot> yup, small functions like having `Module.equal` are not inlined and Module.equal a lot of times is literally a single cnstruction +2021-06-04 10:25:09 theblatte> blazing fast = 6x slower than without -O3 ^^ +2021-06-04 10:25:11 companion_cube> that's what I was talking about +2021-06-04 10:25:21 zozozo> companion_cube: indeed, ^^ +2021-06-04 10:25:30 companion_cube> so it can make a big difference :) +2021-06-04 10:25:35 companion_cube> even without flambda +2021-06-04 10:25:45 theblatte> ohhh, recently-ish we noticed marking some functor arguments as [@inline] made a big difference +2021-06-04 10:25:52 companion_cube> :D +2021-06-04 10:25:59 zozozo> that's not surprising +2021-06-04 10:26:04 theblatte> is that sort of thing (adding @inline) not needed with flambda + release profile? +2021-06-04 10:26:25 theblatte> or is that independent? +2021-06-04 10:26:26 companion_cube> it still gives you better control +2021-06-04 10:26:34 zozozo> iirc, flambda tries as much as possibvle to inline functor applicaiton that are at toplevel, so you shouldn't need the annotations in that particular case +2021-06-04 10:26:51 companion_cube> do a lot of people use flambda1 in production?! +2021-06-04 10:26:59 zozozo> companion_cube: jane street i guess ? +2021-06-04 10:27:07 companion_cube> ahah maybe they have enough RAM +2021-06-04 10:27:16 zozozo> also, the binary release of dolmen is now compiled with flambda, :D +2021-06-04 10:27:18 companion_cube> I stopped using it years ago +2021-06-04 10:27:18 theblatte> infer is 30% faster with flambda, so you bet +2021-06-04 10:27:32 companion_cube> wow +2021-06-04 10:27:37 companion_cube> well can't wait for flambda2 +2021-06-04 10:28:01 companion_cube> anyway, the point of --profile=release is to tell dune to not block optimizations, it doesn't enable new ones +2021-06-04 10:28:05 companion_cube> for that you can use ocamlopt_flags +2021-06-04 10:28:13 d_bot> tezos is another 20% faster on flambda +2021-06-04 10:28:15 zozozo> we're trying very hard on making it so that flambda2 is as fast as possible, but it's hard sometimes +2021-06-04 10:28:27 companion_cube> zozozo: it's not just a question of "fast" +2021-06-04 10:28:35 companion_cube> it's also "not gobble up RAM on bad cases" +2021-06-04 10:28:38 theblatte> yes but I'm trying to understand if adding --profile=release will make a difference +2021-06-04 10:28:51 theblatte> I'll try that +2021-06-04 10:29:01 companion_cube> so, -p foo already switches to release mode +2021-06-04 10:29:12 companion_cube> it's only if you use `dune build @all` and that kind of stuff that it matters +2021-06-04 10:29:21 zozozo> companion_cube: right, can you send me (if you recall), the packages that were not working 'or taking ut too much RAM) ? +2021-06-04 10:29:24 companion_cube> it makes compilation slower (removes -opaque) but enables optimization +2021-06-04 10:29:27 companion_cube> zozozo: at least dose3 +2021-06-04 10:29:30 companion_cube> that was the blocker +2021-06-04 10:29:32 zozozo> so that we can at least try and see what happens with flamdba2 +2021-06-04 10:29:32 companion_cube> and camlp4 +2021-06-04 10:29:35 d_bot> even the new dose3? +2021-06-04 10:29:52 d_bot> dose3 6 changed quite a bit of stuff, even parmap they're using now +2021-06-04 10:30:34 theblatte> companion_cube: we do "dune build infer.exe" +2021-06-04 10:31:29 companion_cube> lol +2021-06-04 10:31:39 companion_cube> yeah you need the flag +2021-06-04 10:31:54 companion_cube> idk about dose3 6 +2021-06-04 10:32:01 companion_cube> I stopped trying flambda a while ago +2021-06-04 10:32:17 companion_cube> using too much ram is a big problem imho +2021-06-04 10:32:45 d_bot> that seems weird, flambda reduces the number of allocations considerably +2021-06-04 10:33:30 companion_cube> per module +2021-06-04 10:33:38 companion_cube> with this you might also gain cross module +2021-06-04 10:33:54 theblatte> ah I thought you meant too much ram used during compilation :) +2021-06-04 10:34:09 companion_cube> that's what I meant yes +2021-06-04 10:34:11 companion_cube> sorry +2021-06-04 10:34:18 companion_cube> but theblatte, try the flag :p +2021-06-04 10:34:26 d_bot> yeah makes sense +2021-06-04 10:34:29 theblatte> companion_cube: I am!! +2021-06-04 10:34:30 companion_cube> and also, make sure .cmx are installed for all libraries +2021-06-04 10:34:52 d_bot> do we have an idea on what leads flambda to use so much memory? +2021-06-04 10:34:57 theblatte> companion_cube: how? +2021-06-04 10:35:14 companion_cube> well most should do it if they use dune +2021-06-04 10:35:25 d_bot> Is there any info on flambda2 floating around yet? +2021-06-04 10:35:36 companion_cube> there's zozozo's brain +2021-06-04 10:35:40 companion_cube> although it's not floating +2021-06-04 10:39:04 d_bot> technically, his brain is floating in his skull +2021-06-04 10:39:15 companion_cube> he might be a robot +2021-06-04 10:39:17 companion_cube> can't be sure +2021-06-04 10:39:27 d_bot> if he is doing flambda2 he is a robot +2021-06-04 10:40:07 zozozo> right, I can try and answer questions about flambda2 +2021-06-04 10:40:17 zozozo> since I'm working on it, ^^ +2021-06-04 10:41:07 companion_cube> it'll be the default if it works well enough, right? +2021-06-04 10:41:53 zozozo> that's the plan +2021-06-04 10:43:01 companion_cube> 🤞 +2021-06-04 10:43:57 d_bot> Hmm, I'm not sure I know enough about it to ask good questions +2021-06-04 10:45:07 d_bot> Although maybe "what was not adequate about the first flambda design" is an obvious one +2021-06-04 10:45:29 theblatte> companion_cube: ah, but actually we never use dune default profiles, we do --profile=opt (or dev). There's no -opaque in the build logs +2021-06-04 10:45:41 companion_cube> ah, I see +2021-06-04 10:45:47 theblatte> phew :) +2021-06-04 10:45:49 companion_cube> (wait, there's a profile=opt??) +2021-06-04 10:46:01 theblatte> you can name your profile however you want :p +2021-06-04 10:46:40 zozozo> @ggole: basically, flambda2 now uses a CPS representation of source code, which is very useful (whereas flambda1 had an ANF representation iirc) +2021-06-04 10:46:40 theblatte> then we have (env (opt (ocamlopt_flags (:standard -O3))), etc. +2021-06-04 10:47:35 theblatte> maybe we should have -opaque for profile=dev though! +2021-06-04 10:47:52 d_bot> wondering, when the optimization mentioned in 8958 may be triggered after inlining? +2021-06-04 10:48:19 d_bot> It would be weird if flambda allocated two identical temporary blocks +2021-06-04 10:48:30 d_bot> I also have a question on flambda 2.0 +2021-06-04 10:48:37 d_bot> @guigui CPS is an interesting direction. It used to be the IL style of choice, but seems to have gone right out of favour. +2021-06-04 10:49:04 zozozo> Drup: fire away, ^^ +2021-06-04 10:49:07 d_bot> Do you (the flambda team) intend to keep working on it instead of instantly decide to shoot the for moon and work on flambda 3.0 ? +2021-06-04 10:49:36 companion_cube> lolol +2021-06-04 10:49:39 companion_cube> I could say the same of ppx +2021-06-04 10:49:44 zozozo> Drup: the plan is to continue working on flambda2 +2021-06-04 10:50:14 d_bot> Although people who use ANF seem to have discovered the need for very continuation-like constructs with join points +2021-06-04 10:50:17 zozozo> basically, doing flambda1 gave the team (note that this was before I joined) some insights about how to do and not to do some things +2021-06-04 10:50:17 d_bot> (you don't have to answer it, it's friday evening, and I know you don't really have a sway on this all that much) +2021-06-04 10:50:50 zozozo> Drup: indeed, but I'm right now in a conference call with Pierre so I can ask him, ^^ +2021-06-04 10:51:02 d_bot> Say hello from me :p +2021-06-04 10:51:22 zozozo> Drup: he says hello to you too +2021-06-04 10:52:18 theblatte> hi pchambart :) +2021-06-04 10:52:48 companion_cube> coucou to him +2021-06-04 10:52:58 d_bot> but yeah, flambda in general is a bit moonshot infused sometimes. I understand why (it's much more fun to work on "The Perfect IR") but it's a bit infuriating. +2021-06-04 10:53:28 companion_cube> like multicore has been for a while, too +2021-06-04 10:53:31 companion_cube> or even opam 2.1 +2021-06-04 10:53:36 companion_cube> seems like a common theme in OCaml :p +2021-06-04 10:53:37 theblatte> companion_cube: alright so something good still came out of that: compiling with -opaqe turns a 50s full build into a 40s one \o/ and I assume it's even better for incremental build? +2021-06-04 10:53:42 zozozo> yeah, but now with flambda2 we should have a good enough IR to do what we want and need +2021-06-04 10:54:11 companion_cube> theblatte: err it's faster builds, but slower code, yes +2021-06-04 10:54:12 d_bot> let's hope so +2021-06-04 10:54:34 theblatte> companion_cube: it's for "dev" builds +2021-06-04 10:54:49 companion_cube> then yes +2021-06-04 10:55:07 companion_cube> with -opaque you have fully separate compilation +2021-06-04 10:55:24 theblatte> I was wondering why dune was doing so much work on incremental compilation ^^ +2021-06-04 10:55:31 theblatte> thanks! +2021-06-04 10:56:35 d_bot> (I though dune already added `-opaque` for dev builds) +2021-06-04 10:57:05 d_bot> @guigui what was difficult before that's easy now? +2021-06-04 10:57:06 companion_cube> seems like theblatte has his own profiles +2021-06-04 10:57:37 companion_cube> zozozo: so in CPS, do you have 2 "kinds" of function calls? normal and continuations? +2021-06-04 10:57:42 companion_cube> to make sure there's no new closures? +2021-06-04 10:57:53 d_bot> That doesn't seem very smart if those are less though-out than the normal ones :3 +2021-06-04 10:57:56 theblatte> dune profiles have... weird defaults +2021-06-04 10:58:24 theblatte> fair enough :p +2021-06-04 10:59:06 zozozo> companion_cube: continuations in flambda2 are more along the lines of static jumps +2021-06-04 10:59:12 companion_cube> cool +2021-06-04 10:59:33 companion_cube> zozozo: please stop delaying the PR for ocaml.org +2021-06-04 10:59:33 companion_cube> plz +2021-06-04 11:00:48 zozozo> sorry, ^^ +2021-06-04 11:00:57 companion_cube> why does a PR against a fracking website take a full week to be merged anyway +2021-06-04 11:01:29 zozozo> right, that's a problem +2021-06-04 11:02:22 companion_cube> if you want the website to go stale because no one opens a PR to update it, that's the best way to go +2021-06-04 11:02:38 octachron> companion_cube, because there is noone clearly responsible? My commit right is normally mostly for OCaml releases +2021-06-04 11:03:07 companion_cube> is Anil trying to do too many things? :p +2021-06-04 11:03:21 companion_cube> definitely not blaming you octachron +2021-06-04 11:04:36 companion_cube> just annoyed that this, which should have taken literally 5 minutes, is taking a week +2021-06-04 11:04:41 theblatte> interesting, -opaque seems to make no difference for incremental compilation, only for full compilation +2021-06-04 11:04:46 companion_cube> during which the information on the website is misleading +2021-06-04 11:05:14 companion_cube> theblatte: try modifying a file deep in the dep graph, but only the implementation, not the interface +2021-06-04 11:05:22 theblatte> that's what I tried +2021-06-04 11:05:36 companion_cube> hu +2021-06-04 11:06:25 theblatte> humm, there's a leftover -opaque in the logs, my experiment must have gone wrong, sorry, digging in further +2021-06-04 11:11:27 d_bot> theblatte: also opaque allows to build strictly against cmi which leads to better parallelism if you're using mli well +2021-06-04 11:12:30 d_bot> so opaque should definitely matter for incremental as without it you need to rebuilt the full tree if any module changes +2021-06-04 11:12:36 d_bot> maybe dune doesn't have this implemented? +2021-06-04 11:12:48 d_bot> @rgrinberg any idea here? +2021-06-04 11:13:00 theblatte> I think because we use the "dev" name for our profile -opaque was already being passed! +2021-06-04 11:13:48 theblatte> even though we override (flags ...) +2021-06-04 11:13:53 theblatte> but not ocamlopt_flags +2021-06-04 11:15:11 octachron> companion_cube, anyway my week ended 15 minutes ago, so the PR is merged. +2021-06-04 11:16:16 theblatte> and we still see a win for the full build by forcing -opaque because it passes it in a bunch of places where dune doesn't by default +2021-06-04 11:16:58 theblatte> looks like that's when building the entire libraries' .cmx +2021-06-04 11:17:21 @adrien> octachron: thanks :) +2021-06-04 11:17:46 theblatte> so, hmmm, *shrug* +2021-06-04 11:39:10 companion_cube> octachron: 😂 thank you +2021-06-04 11:43:12 companion_cube> and the website is updated already, nice +2021-06-04 11:46:07 companion_cube> "variant constructor unboxing" that's nice +2021-06-04 11:46:16 companion_cube> didn't we discuss it here recently? +2021-06-04 11:46:21 companion_cube> perhaps about bitvectors +2021-06-04 11:51:05 olle> oooooh +2021-06-04 13:58:46 zozozo> @ggole : sorry for the delay, basically, control flow manipulation is much easier in cps form, also inlining a function's body is tricky to do in ANF (and can be exponential in the worst case if you need to ensure the result if in strict ANF) +2021-06-04 13:59:23 companion_cube> coudl you post a snippet of a tiny CPS AST? :p +2021-06-04 13:59:39 companion_cube> sth where we could see let, application, and like a primitive like + ? +2021-06-04 13:59:44 zozozo> sure +2021-06-04 13:59:56 companion_cube> 👍 +2021-06-04 14:00:08 companion_cube> I want to see how the continuations are represented +2021-06-04 14:07:32 zozozo> https://gist.github.com/Gbury/7a02a35cb4906914fa351183490f11b2 +2021-06-04 14:07:44 zozozo> basically, a continuation is a (unique) integer +2021-06-04 14:08:05 zozozo> companion_cube: ^ +2021-06-04 14:09:06 companion_cube> so, apply_cont is where you jump +2021-06-04 14:09:09 zozozo> yup +2021-06-04 14:09:29 zozozo> also, after a function call (i.e. Apply_expr), you call the given continuation with the return value of the function call +2021-06-04 14:09:35 companion_cube> and why is there 2 let? +2021-06-04 14:09:42 companion_cube> yeah +2021-06-04 14:09:49 companion_cube> and you call the function on already computed arguments +2021-06-04 14:09:59 zozozo> you can bind continuations, and regular expressions +2021-06-04 14:10:37 companion_cube> hmmm +2021-06-04 14:10:54 companion_cube> I mean, Let_expr makes sense, it's a local definition, ok +2021-06-04 14:11:00 companion_cube> but what's the "handler" in Let_cont? +2021-06-04 14:11:00 zozozo> yup +2021-06-04 14:11:07 zozozo> the code of the continuation +2021-06-04 14:11:17 companion_cube> oh shit ok +2021-06-04 14:11:17 zozozo> let_cont k args = handler in body +2021-06-04 14:11:22 companion_cube> nice +2021-06-04 14:11:43 zozozo> note that continuations are local to a function's body and cannot escape +2021-06-04 14:11:44 companion_cube> so patmatch could also create such expressions, for example +2021-06-04 14:11:55 zozozo> since continuations are not regular value (i.e. simples or named) +2021-06-04 14:11:55 companion_cube> with explicit sharing and everything +2021-06-04 14:12:02 zozozo> yes +2021-06-04 14:12:29 companion_cube> (I imagine switch could also have a default case) +2021-06-04 14:12:49 zozozo> in this case no, the switch has no default case +2021-06-04 14:12:56 zozozo> it simplifies some things +2021-06-04 14:13:07 zozozo> but in theory it could +2021-06-04 14:13:08 companion_cube> even in flambda2? +2021-06-04 14:13:17 companion_cube> I guess since you can share continuations, it's ok +2021-06-04 14:13:24 zozozo> it's just that having no default case means the code is much more regular +2021-06-04 14:13:29 zozozo> you can fold on the arms of the switch +2021-06-04 14:13:41 zozozo> and not have to specifically treat the default case +2021-06-04 14:15:30 companion_cube> heh, fair enough +2021-06-04 14:16:03 companion_cube> I think the insight that continuations are not values, is sth I didn't realize +2021-06-04 14:16:05 companion_cube> so thank you! :) +2021-06-04 14:16:27 zozozo> no problem, ^^ +2021-06-04 14:30:12 d_bot> zozozo: hmm, that's actually pretty close to what I expected. Thanks for taking the time to write it up. +2021-06-04 14:33:07 d_bot> When I tried CPS ILs I found it difficult to perform what should be simple transformations like commuting `case` expressions, but perhaps my approach was too naive. +2021-06-04 14:37:04 zozozo> @ggole : well, commuting switches would be quite complicated indeed (and isn't done currently in flambda2) +2021-06-04 14:38:59 d_bot> That's one benefit of a more lambda-calculus like IL, it's quite easy to do context-directed optimisations (of which commuting is probably the most significant) +2021-06-04 14:39:37 zozozo> yeah, but then again, I don't think commuting is really something that we want to do in flambda2 +2021-06-04 14:39:39 d_bot> But there are downsides with scope +2021-06-04 14:39:55 d_bot> will flambda2 carry through to faithful CPS compilation or what +2021-06-04 14:40:21 zozozo> @colin : I'm not sure what you mean ? +2021-06-04 14:41:00 d_bot> SML/NJ style CPS all the way? Seems unlikely. +2021-06-04 14:41:03 d_bot> I've seen compilers that use CPS as an IR yet blast to something slightly different to compile to something that still uses a runtime stack +2021-06-04 14:41:22 d_bot> Yeah, I don't think SML/NJ or MLton can be described as using CPS to much of an extent nowadays tbh +2021-06-04 14:41:57 d_bot> I thought SML/NJ still used that for their `Cont` implementation +2021-06-04 14:41:57 zozozo> ah well, the flambda IR is in CPS, but there will be no change to the other IR of the compiler, so that's that, ^^ +2021-06-04 14:43:13 d_bot> is the Apply_cont constructor in this cps.ml file representing "contificated"/static continuations? +2021-06-04 14:43:43 zozozo> yeah, it represents static continuations bound previously by a Let_cont +2021-06-04 14:43:59 d_bot> interesting, I've only ever seen the IR presented in Appel's CwC book +2021-06-04 14:44:30 d_bot> There's a nice paper on an CPS IR a bit like this that you might be interested in +2021-06-04 14:44:36 d_bot> is it by Kennedy +2021-06-04 14:44:42 d_bot> Yeah +2021-06-04 14:44:56 d_bot> yeah, I've seen that as well actually, it's the one most people seem to go with I think +2021-06-04 14:45:17 d_bot> Makes a lot of sense if you aren't supporting call/cc +2021-06-04 14:45:18 companion_cube> zozozo: what comes after flambda? something with a control flow graph already? +2021-06-04 14:45:36 zozozo> companion_cube: after flambda, it's cmm +2021-06-04 14:46:07 d_bot> been a while since I've toyed with CPSing compilers because very few go the full mile with the whole "no runtime stack" - they go the chicken route and use it as a GC nursery because they can't get their C compiler to do the strict (tail) call -> jumps that CPS requires and LLVM certainly can't handle CPS so you're just stuck writing your own back-end each time +2021-06-04 14:46:17 zozozo> (fun factoid: cmm quite literraly means C minus minus, :p ) +2021-06-04 14:46:56 d_bot> If the continuations are second class as in this example, then you can probably linearise to SSA fairly successfully +2021-06-04 14:47:25 companion_cube> hmm so cmm still has function calls and expressions, but no types, right? +2021-06-04 14:47:33 d_bot> I just think going from ANF -> LLVM (SSA) is simpler +2021-06-04 14:47:41 d_bot> Although there's the usual complications of closure conversion and whatnot because LLVM is first order +2021-06-04 14:48:10 d_bot> Oleg seems to have some strong views on actually doing faithful compilation of CPS as well, along the lines of "whole-program continuations are never useful" and uh "the garbage collector doesn't like this" etc. paraphrasing (perhaps inaccurately) here +2021-06-04 14:48:21 zozozo> companion_cube: cmm has very minimal types (basically it says whether a value can/should be scanned) +2021-06-04 14:48:39 d_bot> Well, CPS as a compiler IL is a different storly to exposing continuations reified as functions +2021-06-04 14:48:42 companion_cube> yeah, that's not typing ;) +2021-06-04 14:49:20 companion_cube> but there you eliminate continuations again, right? towards some sort of static jump, like local exceptions? +2021-06-04 14:49:27 zozozo> yup +2021-06-04 14:49:38 zozozo> cmm has static jumps and flambda continuations maps perfectly to that +2021-06-04 14:49:50 zozozo> (ofc continuations that are used exactly once can be inlined) +2021-06-04 14:50:23 companion_cube> right +2021-06-04 14:50:32 d_bot> Either a return or a jump +2021-06-04 14:50:36 d_bot> this discussion is urging me to actually go and read Shivers' k-CFA stuff since I've always just avoided any real detail/proposed benefit of program transformations in CPS +2021-06-04 14:50:39 companion_cube> you can still use static jumps for patmathc and stuff +2021-06-04 14:50:54 d_bot> Or maybe an exception handler if double-barrelled CPS +2021-06-04 14:51:18 zozozo> flambda actually has double-barrelled CPS +2021-06-04 14:51:22 zozozo> (flambda2) +2021-06-04 14:51:47 d_bot> That makes sense, rather than duplicating all of the control constructs +2021-06-04 14:51:51 d_bot> And optims on them +2021-06-04 14:52:40 d_bot> what's double-barrelled, just doing the CPS twice? +2021-06-04 14:52:58 companion_cube> wait +2021-06-04 14:53:03 companion_cube> does the second handler also work for effects? +2021-06-04 14:53:10 companion_cube> or wolud there be a third handler? +2021-06-04 14:53:11 d_bot> Along with the usual return continuation you pass another continuation which is the error/exn path +2021-06-04 14:53:42 d_bot> ah +2021-06-04 14:54:19 zozozo> companion_cube: effects as in algebraic effects (cf multicore) ? +2021-06-04 14:54:29 companion_cube> yes +2021-06-04 14:54:34 companion_cube> runtime effects anyway +2021-06-04 14:54:38 companion_cube> the one shot continuations :) +2021-06-04 14:54:43 zozozo> that's a very good question +2021-06-04 14:55:21 companion_cube> I think exceptions will just be another effect, except in the type system, so you can probably only have 2 +2021-06-04 14:55:22 d_bot> who funds OCamlPro? INRIA? Jane Street? or is it its own company +2021-06-04 14:57:27 d_bot> I have a question about the change log of 4.13. The change "type check x |> f and f @@ x as (f x) ` is marked as breaking change. What are the consequences of that change actually? (sorry for interrupting a very interesting conversation) +2021-06-04 14:59:15 companion_cube> it might change a few things in a subtle way +2021-06-04 14:59:22 companion_cube> like `f x` can be `f ?a ?b x` +2021-06-04 14:59:26 companion_cube> if f has optional arguments +2021-06-04 14:59:43 zozozo> @colin : OCamlPro is its own company, and janestreet is one client of ocamlpro +2021-06-04 15:00:51 d_bot> Ah, I see, I was looking at compiler jobs at Jane Street (wishful thinking) but now they don't seem like they'd be as interesting as this flambda2 stuff (unless there's some ties between both companies) +2021-06-04 15:01:19 d_bot> Ah yes, I didn't think of optional arguments, thanks! +2021-06-04 15:01:37 companion_cube> aren't they funding flambda2? :D +2021-06-04 15:01:37 zozozo> @colin : well, the work on flambda2 is funded by JaneStreet, ^^ +2021-06-04 15:41:47 d_bot> type check of `x |> f` as `f x` is something I was not expecting but I really appreciate +2021-06-04 15:42:00 d_bot> now we need to type check `let x = y` in the opposite order +2021-06-04 15:43:25 d_bot> can we implement this kind of subtyping or would it be unsound? +2021-06-04 15:43:26 d_bot> ```ocaml +2021-06-04 15:43:27 d_bot> module X : sig +2021-06-04 15:43:28 d_bot> type 'a t = private 'a +2021-06-04 15:43:30 d_bot> end = struct +2021-06-04 15:43:31 d_bot> type 'a t = 'a +2021-06-04 15:43:32 d_bot> end +2021-06-04 15:43:34 d_bot> let add (a : int X.t) (b : int) = a + b +2021-06-04 15:43:35 d_bot> ``` +2021-06-04 16:03:27 d_bot> This is already implemented, with an explicit coercion as usual: `let add a b = (a:int X.t:>int) + b` +2021-06-04 19:56:48 hackinghorn> hi +2021-06-04 19:57:03 hackinghorn> how do I run commands like ls for linux in ocaml? +2021-06-04 19:59:38 dh`> there's a binding for system() somewhere +2021-06-04 19:59:40 hackinghorn> oh, fileutils work +2021-06-04 19:59:56 hackinghorn> got it, thanks +2021-06-04 23:15:51 d_bot> Why not implicit? +2021-06-04 23:20:48 companion_cube> There are no implicit coercions in ocaml +2021-06-04 23:51:53 d_bot> 👀 nice to see the stdlib increasingly fleshed out feels good +2021-06-05 00:39:14 companion_cube> like what? +2021-06-05 00:57:05 d_bot> like fold_left and fold_right with the strings +2021-06-05 00:57:12 d_bot> the math functions for floats +2021-06-05 01:05:15 companion_cube> Lolol ok +2021-06-05 01:05:33 companion_cube> Fold on string, heh? +2021-06-05 01:05:43 companion_cube> Forgot that that wasn't there +2021-06-05 01:06:10 d_bot> hey guy who wrote his own stdlib +2021-06-05 01:06:13 d_bot> it's pretty cool to me ok? +2021-06-05 07:50:23 companion_cube> :D it is, it is +2021-06-05 09:57:02 tane> howdy! found the way +2021-06-05 11:46:29 d_bot> anyone familiar with ocaml verification? termination in particular +2021-06-05 12:03:08 d_bot> small code or large projects? +2021-06-05 12:41:30 d_bot> small code +2021-06-05 13:02:29 companion_cube> @giga_08 you could give a look at try.imandra.ai (it's proprietary but termination checking is def. sth interesting) +2021-06-05 18:18:14 d_bot> learning ocaml and I occasionally giggle to myself because "O Caml! My Camel!" will pop into my head like a line from some sort of desert-themed walt whitman poem +2021-06-05 18:19:38 companion_cube> `my $camel` sounds more like perl, tbh +2021-06-05 18:21:20 d_bot> perls before swine +2021-06-05 23:22:45 kluk> how do I start using DynArray? I tried include DynArray, include Extlib, nothing works +2021-06-05 23:23:07 companion_cube> you need to have it in your dune file, if you use dune +2021-06-05 23:23:10 companion_cube> and to install it in the first place +2021-06-05 23:24:09 kluk> I don't know what dune is yet, I'm still a beginner at OCaml. how do I install DynArray? with opam right? +2021-06-05 23:24:51 companion_cube> hmmm if you're that beginner, maybe take a look at a book +2021-06-05 23:24:55 companion_cube> there's a lot to explain :/ +2021-06-05 23:26:43 kluk> I just wanted to play around on the ocaml repl with some arrays... not looking for making a project, folders, dune stuff, any of that, if possible to avoid at this point. Is it possible to just play with the OCaml language to learn it and not worry about how it mixes up with unix? +2021-06-05 23:27:56 companion_cube> ah well, sure, just type `ocaml` +2021-06-05 23:28:07 companion_cube> but Dynarray is a 3rd party library for vectors/resizable arrays +2021-06-05 23:28:16 companion_cube> it's not exactly a central type in OCaml : +2021-06-05 23:28:17 companion_cube> :p +2021-06-05 23:29:14 kluk> yes I can get to the repl, but I wanted to play with arrays first without worrying about packages, does that make sense? I wanted to explore OCaml the language first, like a try.ocaml.org sort of thing if that makes sense... I wanted to have some fun with the language and learn it and not have to think about packages and managing projects for a little +2021-06-05 23:30:40 kluk> I need a stack whose elements can be randomly accessed by an integer so I just happen to have an exact use case for arrays, but I am open to suggestions +2021-06-05 23:34:07 companion_cube> arrays are in the stdlib +2021-06-05 23:34:17 companion_cube> not dynamic arrays +2021-06-05 23:34:31 companion_cube> but yeah, a stack with indexing is a good use case +2021-06-05 23:34:55 kluk> companion_cube :) +2021-06-06 00:03:27 d_bot> IMO vectors should replace arrays as a primary data type in the language +2021-06-06 00:04:29 companion_cube> why "replace"? +2021-06-06 00:04:42 companion_cube> I think it'd be nice to be able to build them safely +2021-06-06 00:04:47 d_bot> as the *primary* data type +2021-06-06 00:04:51 companion_cube> but otherwise, they have some overhead +2021-06-06 00:05:03 companion_cube> arrays are simpler as they're always fully initialized +2021-06-06 00:05:11 d_bot> yeah the overhead is very minor though +2021-06-06 00:05:34 d_bot> very few languages have arrays as their primary data structure +2021-06-06 00:05:48 d_bot> python's lists are vectors +2021-06-06 00:05:49 companion_cube> I mean… java? +2021-06-06 00:06:12 companion_cube> I think the problem is the GC, because in a vector you need some unitialized space +2021-06-06 00:06:15 companion_cube> even in rust it's quite dirty +2021-06-06 00:06:46 d_bot> hmm +2021-06-06 00:07:10 companion_cube> it's hard to do well without a bit of Obj currently :/ +2021-06-06 00:08:53 d_bot> ok so I guess python/ruby's bias may be due to their reference counting +2021-06-06 00:09:11 companion_cube> also they're insanely high level and slow :p +2021-06-06 00:09:39 d_bot> yeah but that's beside the point. java has array, c# has array vs List (really a vector) +2021-06-06 00:09:54 companion_cube> java has ArrayList, but only for boxed types +2021-06-06 00:09:59 companion_cube> the primitive on the JVM is arrays, same as OCaml +2021-06-06 00:10:07 companion_cube> (except with unsound variance) +2021-06-06 00:10:12 d_bot> right +2021-06-06 00:10:30 d_bot> ok so yeah I think I'm just using python too much recently +2021-06-06 00:11:00 d_bot> javascript also has array as its primary type +2021-06-06 00:11:07 companion_cube> remember that in OCaml, an array is *one* word of overhead +2021-06-06 00:11:12 d_bot> so are python and ruby really the exceptions? +2021-06-06 00:11:26 companion_cube> as far as primitive types go? I'm not sure +2021-06-06 00:13:25 d_bot> JS arrays are dynamic arrays / vectors +2021-06-06 00:13:34 d_bot> and the implementation of it is really all over the place +2021-06-06 00:13:51 d_bot> perl has dynamic arrays. also reference counted +2021-06-06 00:14:18 companion_cube> _scripting languages_ were primitives are all in C +2021-06-06 00:15:51 d_bot> interesting. and it's gc'd. +2021-06-06 00:16:30 d_bot> @companion_cube GC is only an issue if you don't have a bit to tell the GC not to scan the uninitialized memory. If OCaml had it, it wouldn't be an issue. +2021-06-06 00:16:58 companion_cube> sure, if you entirely rewrite the GC so it's not just based on the initial tag… :p +2021-06-06 00:17:13 d_bot> but JS objects nowadays operates like OCaml blocks, adding and removing field is generally a bad idea because of the types, while it is possible that can trigger a whole lot of compiled and optimized code to be invalidated +2021-06-06 00:17:15 d_bot> hmm.. no I guess you need to build it into the GC process itself so it knows how to process the vector +2021-06-06 00:17:24 d_bot> so it looks at length vs capacity +2021-06-06 00:17:26 d_bot> well we can extend the object header +2021-06-06 00:17:26 companion_cube> (well for a vector you'd need to fit 2 sizes in one, basically: capacity, and actual size) +2021-06-06 00:17:35 d_bot> I'm looking on it during the shower +2021-06-06 00:17:52 d_bot> yeah a bit is not enough, you need to teach the GC about a new kind of object +2021-06-06 00:18:00 companion_cube> also remember that vectors are 2 levels of indirection, not one +2021-06-06 00:18:06 companion_cube> one to the {len,capacity,ptr} +2021-06-06 00:18:12 companion_cube> + the pointer itself +2021-06-06 00:18:31 companion_cube> but you've got to have this level of indirection so you can change the underlying array/pointer +2021-06-06 00:19:02 d_bot> that's true +2021-06-06 00:19:36 companion_cube> so that's non trivial overhead compared to a basic array, when all you need is an array +2021-06-06 00:19:53 d_bot> but that access can be mostly reduced if you know the cell size at compile time +2021-06-06 00:19:56 d_bot> the problem is that you very rarely need an array +2021-06-06 00:20:38 d_bot> if your primary type is a list, all an array gives you is mutability + O(1) access to any element. it's good, but the lack of ability to extend it is annoying +2021-06-06 00:20:46 d_bot> if you're doing mutable stuff, you almost always want to extend it +2021-06-06 00:20:56 companion_cube> idk, it's nice in ASTs for example +2021-06-06 00:21:03 companion_cube> I agree that often a vector is also useful +2021-06-06 00:22:19 d_bot> I wonder if having an unrolled linked list with some tricks wouldn't be enough for almost all cases +2021-06-06 00:22:53 companion_cube> for mutable stuff we just should have a good vector +2021-06-06 00:22:59 d_bot> like couple cells all cache aligned + pointers to additional cells if they were created all together so that you can do O(1) after a List.map +2021-06-06 00:23:03 companion_cube> for immutable stuff, we _could_ use HAMT… but well +2021-06-06 00:25:01 d_bot> copy on write is the solution to all problems +2021-06-06 00:25:11 companion_cube> noooo :D +2021-06-06 00:27:33 d_bot> computers are fun, nowadays you have an ALU and caching inside of the MMU +2021-06-06 00:28:05 d_bot> lisp machine to rule them all +2021-06-06 00:51:48 d_bot> companion_cube: what do you do to prevent the GC from scanning the uninitialized vector area? +2021-06-06 00:53:27 d_bot> If it is set to 0x0 the GC should just behave normally, it's a block of tag 0, size 0 +2021-06-06 00:57:50 companion_cube> @Bluddy in containers, indeed, I fill the vector with 0 +2021-06-06 00:58:03 companion_cube> or 0.0 if it's a float array 🙄 +2021-06-06 01:34:37 d_bot> ugh yeah that's bad +2021-06-06 01:34:57 companion_cube> not like we have a better option, imhp +2021-06-06 01:34:59 companion_cube> imho +2021-06-06 01:37:39 d_bot> I wonder what other languages do +2021-06-06 01:37:44 d_bot> ones with GC +2021-06-06 01:40:49 companion_cube> well, java fills with null I imagine +2021-06-06 01:40:54 companion_cube> boxed primitives and all that +2021-06-06 01:41:03 companion_cube> D… probably does ugly stuff? +2021-06-06 01:41:10 companion_cube> Go has 0 values for all types, so that's easy +2021-06-06 01:41:31 companion_cube> and the scripting stuff has nil/None/whatever to fill the blanks +2021-06-06 01:42:17 d_bot> at the Obj level it would be nice if you could have a contiguous array where the size is the length, and right after that you'd place a string header with the remaining size +2021-06-06 01:42:38 companion_cube> you'd have to move the header every time you push/pop? :/ +2021-06-06 01:42:48 d_bot> not a huge deal. same cache line +2021-06-06 01:43:07 companion_cube> ideally push should be as simple and inlineable as possible :p +2021-06-06 01:43:53 d_bot> still pretty simple. copy header over, reduce string size +2021-06-06 01:44:34 companion_cube> + code path for possible resize… that's a lot more than just a normal push +2021-06-06 01:44:37 d_bot> pop doesn't need to do anything because you can just zero data out at that point +2021-06-06 01:45:12 d_bot> that code path is there regardless +2021-06-06 01:45:38 d_bot> a multi-push function can be more efficient as it can do the header copy once +2021-06-06 01:45:59 companion_cube> pop still needs to copy the header back +2021-06-06 01:46:58 d_bot> yeah I guess that's true. the only annoying thing about the header is the size counter +2021-06-06 01:47:20 companion_cube> I'd rather wish OCaml had a primitive for partially initialized arrays, and that's it +2021-06-06 01:47:22 d_bot> but it should be doable with a couple of instructions +2021-06-06 01:47:43 d_bot> well that's not going to happen anytime soon +2021-06-06 01:48:23 d_bot> it can happen in the 64-bit runtime, but the 32-bit cannot handle it +2021-06-06 01:48:38 d_bot> because you need that extra header space for the size +2021-06-06 01:48:39 companion_cube> not sure how that's related :p +2021-06-06 01:49:03 companion_cube> I just want an API for the array with a valid 0 inside +2021-06-06 01:49:16 companion_cube> that doesn't force me to Obj.magic to see if it's a float array or normal array +2021-06-06 01:49:16 d_bot> valid 0? +2021-06-06 01:49:26 companion_cube> a valid object for this array +2021-06-06 01:49:42 companion_cube> a valid object for this array, _as seen by the GC_ +2021-06-06 01:51:38 d_bot> is this another wish? to deal more easily with float arrays? or is it related? +2021-06-06 01:51:58 companion_cube> it's related because it's the only reason I have to use Obj in containers :p +2021-06-06 01:52:04 companion_cube> (or one of the few, I can't remember) +2021-06-06 01:52:20 companion_cube> to be able to implement a vector +2021-06-06 01:52:39 d_bot> but it doesn't deal with this particular issue +2021-06-06 01:52:47 d_bot> I mean they're phasing out float arrays +2021-06-06 01:52:57 companion_cube> yeah that'll be nice +2021-06-06 01:53:16 companion_cube> without float arrays one could always fill the array with 0 +2021-06-06 01:53:29 companion_cube> since the GC doesn't mind 0 +2021-06-06 01:53:55 d_bot> yeah I see that piece of code now +2021-06-06 01:54:12 d_bot> let fill_with_junk_ (a:_ array) i len : unit = +2021-06-06 01:54:15 companion_cube> yep yep +2021-06-06 01:54:27 d_bot> https://github.com/c-cube/ocaml-containers/blob/95e96fb5e12558fa5b1e907a8e315d8c859c23b8/src/core/CCVector.ml#L27 +2021-06-06 01:54:29 companion_cube> always interested in better ideas +2021-06-06 02:04:20 d_bot> For 64-bit machine zero (not OCaml zero) is fine for float arrays as well +2021-06-06 02:05:07 d_bot> So you might be able to get away with coercing to `float array` and then filling with `0.0` +2021-06-06 02:05:26 d_bot> However, the recent `FloatArray` stuff might kill that idea +2021-06-06 02:08:30 d_bot> The no naked pointer changes might also be trouble +2021-06-06 03:32:21 d_bot> Hi everyone! I'm a 3rd-year CS student making personal explorations into programming languages with an emphasis on functional and concatenative languages, as well as metaprogramming and optimizing compilers. +2021-06-06 03:33:32 d_bot> I'm currently using OCaml to build a functional FORTH interpreter that I hope to shape into a general optimizing FORTH compiler +2021-06-06 03:33:49 d_bot> And right now I'm investigating to what extent I can express FORTH concepts in OCaml +2021-06-06 03:42:01 d_bot> Hmm, they're pretty different +2021-06-06 03:43:21 d_bot> OCaml code is very variable heavy, which seems to be at odds with the Forth philosophy of communicating between tiny bits with the stack +2021-06-06 03:43:38 d_bot> So, for example, inside my VM state is a list representing the current data stack. +2021-06-06 03:43:38 d_bot> ```ocaml +2021-06-06 03:43:40 d_bot> type state = { +2021-06-06 03:43:41 d_bot> ds : Int.t list; +2021-06-06 03:43:42 d_bot> (* ... *) +2021-06-06 03:43:44 d_bot> } +2021-06-06 03:43:45 d_bot> ``` +2021-06-06 03:43:46 d_bot> Stack-based interpreters are excellent matches for programming languages with pattern matching facilities, as it turns out. +2021-06-06 03:44:15 d_bot> ```ocaml +2021-06-06 03:44:16 d_bot> type opcode = +2021-06-06 03:44:17 d_bot> | Lit of Int.t +2021-06-06 03:44:19 d_bot> | Add +2021-06-06 03:44:20 d_bot> | Dot +2021-06-06 03:44:21 d_bot> (* ... *) +2021-06-06 03:44:23 d_bot> ``` +2021-06-06 03:44:41 d_bot> Let's define a small opcode set for our VM: push a literal to the stack, add the top two on the stack, and print the top on the stack (`Dot`) +2021-06-06 03:46:01 d_bot> Now, here's where OCaml's list matching becomes very elegant. Let's define a function, `execute`, that takes a state and an opcode and returns a new state that reflects having executed the opcode. +2021-06-06 03:46:01 d_bot> ```ocaml +2021-06-06 03:46:03 d_bot> let execute st = function +2021-06-06 03:46:04 d_bot> | Lit i -> { st with ds = i::st.ds } +2021-06-06 03:46:05 d_bot> | Add -> (* ... *) +2021-06-06 03:46:07 d_bot> | Dot -> (* ... *) +2021-06-06 03:46:08 d_bot> ``` +2021-06-06 03:46:32 d_bot> awaiting the IRC users who'll ask you to read the channel description +2021-06-06 03:46:43 d_bot> Aw shit 🤦‍♂️ +2021-06-06 03:46:49 d_bot> :p +2021-06-06 03:46:52 zozozo> @aotmr : code blocks from discord do not render great on the irc side of this channel, so it'd be best if you could use some paste website to link to code when there are more than a few lines, ^^ +2021-06-06 03:46:59 d_bot> There it is +2021-06-06 03:47:08 zozozo> haha, XD +2021-06-06 03:47:32 d_bot> Well all that goes to say +2021-06-06 03:47:32 d_bot> You can express stack operations using pattern matching. +2021-06-06 03:48:43 d_bot> if you think that's cute, you'll like a similar idea in dependent typing where you can express stack changes (as a list) indexing the opcodes or something similar +2021-06-06 03:48:44 d_bot> For example, to swap the top two items on the stack, you'd use the record update syntax +2021-06-06 03:48:45 d_bot> `{ st with ds = match st.ds with a:🅱️:tl -> b:🅰️:tl | _ -> assert false }` +2021-06-06 03:48:46 d_bot> Last code block for the time being, I promise 😅 +2021-06-06 03:49:17 d_bot> (And you can also use `let` matching, I've found, but I can't get ocaml to stop complaining even though I fully understand it'll crash if there aren't enough elements) +2021-06-06 03:49:30 d_bot> Oh, have a paper on that? +2021-06-06 03:49:54 d_bot> I'm wanting to see how high-level I can get with forth and still generate good code for small microprocessors--say, for NES and game boy dev +2021-06-06 03:50:06 d_bot> no, just thought it was very cute when I studied Agda at university, relevant construction of Hutton's razor can be found at https://github.com/fredrikNordvallForsberg/CS410-20/blob/master/Coursework/Two.agda#L492-L506 what you're saying just reminded me of it, not really relevant just in case you wanted to see cute things +2021-06-06 03:50:15 zozozo> @aotmr : small one-line blocks of code (like your last one) are mostly okay I'd say, ^^ +2021-06-06 03:50:48 d_bot> Oh I'll look at it never the less, thanks. +2021-06-06 03:50:49 d_bot> Forth has its own concept of combinators and I want to try to compile those efficiently +2021-06-06 03:52:04 d_bot> Honestly I'd say OCaml is distantly related to FORTH just usagewise, there's a similar concept of "pipelining". Where in FORTH you'd write a series of words, passing state between them implicitly on the stack, you do the same in Ocaml when expressing a `|>` or `@@` pipeline +2021-06-06 03:54:16 d_bot> This is an interesting idea as, while FORTH is typically untyped, I could use this concept to track the entire lifetimes of values throughout a program +2021-06-06 03:55:20 d_bot> it's just a nice encoding of how the stack ought to change, helps the type system help you implement it correctly (though not a full specification by any means, just a cute stack requirement) +2021-06-06 03:55:27 d_bot> There are some interesting typed concatenative langs +2021-06-06 03:55:47 d_bot> Kitten and Cat +2021-06-06 03:55:48 d_bot> I've finally taken the forth-pill so to speak because I finally understand how to implement a compiler for the language +2021-06-06 03:56:18 d_bot> a whole new world.mp3 https://llvm.moe/ +2021-06-06 03:56:29 d_bot> see past stack-based paradigm +2021-06-06 03:56:58 d_bot> Well, once I have a compiler for a stack-based VM that opens the door to using it as an intermediate representation +2021-06-06 03:57:14 d_bot> would there be any benefit +2021-06-06 03:57:27 d_bot> I, admittedly, have never seen the appeal of stack-based languages for general programming +2021-06-06 03:57:32 d_bot> I used to write postscript by hand recreationally +2021-06-06 03:57:35 d_bot> but that's about it +2021-06-06 03:57:46 d_bot> It's admittedly kind of recreational +2021-06-06 03:58:10 d_bot> I think the real strength is in the way you can build an entire system from the ground up by hand and know every moving part +2021-06-06 03:59:32 d_bot> You could write an optimizing compiler x86 in, oh, a month +2021-06-06 04:00:51 d_bot> sadly the majority of back-end optimisations for x86 are really just suffering +2021-06-06 04:00:59 d_bot> OCaml's own VM is stack-based so it's kind of circular +2021-06-06 04:01:09 d_bot> yeah but that's just the bytecode OCaml stuff +2021-06-06 04:01:12 d_bot> Oh yeah no x86 is a horrible architecture to program for +2021-06-06 04:01:19 d_bot> Sure but it's still a neat thought +2021-06-06 04:01:25 d_bot> But I digress +2021-06-06 04:01:28 d_bot> I used to be confused as to why Xavier Leroy's earlier work seemed to focus rather specifically on bytecode stack machines as the target of Camls +2021-06-06 04:01:51 d_bot> but then someone said like "it was research into creating a tactic computational kernel for some proof assistant" +2021-06-06 04:02:01 d_bot> not sure how true that is, perhaps someone here can clarify if that's nonsense +2021-06-06 04:02:07 d_bot> and Xavier just really likes stack machines +2021-06-06 04:02:56 d_bot> So, it could be that you can take advantage of immutable VM states in unit testing +2021-06-06 04:03:13 d_bot> And using it to accelerate the general process +2021-06-06 04:04:16 d_bot> If you wanted to do an exhaustive search of the program P with inputs a, b, c..., you could run P over every possible value of a, b, c +2021-06-06 04:05:19 d_bot> That is, we're trying to find a, b, c... that causes P to fail +2021-06-06 04:06:00 d_bot> There's actually some tooling for that +2021-06-06 04:06:02 d_bot> See Crowbar +2021-06-06 04:06:08 d_bot> One way to speed up that process is to memoize the VM state, I think +2021-06-06 04:06:44 d_bot> It's not exhaustive search, but coverage-feedback guided random generation +2021-06-06 04:06:47 d_bot> If we find a "success" set of (a, b, c...), we could maybe remember all of the previous states of the VM and if we ever encounter them again we can stop early +2021-06-06 04:07:14 d_bot> But that would blow up your space requirements for little speedup, I'd think +2021-06-06 04:07:17 d_bot> can see why that'd help (as a form of concolic execution) but I think the accepted reality in industry is that Google fuzz their own software over billions of instances using AFL on dozens of Google cloud instances and just consider that alright +2021-06-06 04:08:00 d_bot> My other use case is of a rewindable debugger where you can undo all the way back to the start of the program +2021-06-06 04:08:51 d_bot> time travel debugging is pretty cool +2021-06-06 04:09:07 d_bot> That also brings to mind the idea of a rewindable game engine, I think rewind mechanics are pretty cool in theory +2021-06-06 04:09:12 d_bot> I always wanted a clean injection mechanism for debugging +2021-06-06 04:09:27 d_bot> hot reloading debugging stubs, that kinda thing +2021-06-06 04:09:54 d_bot> I'm still not entirely familiar with the mechanics of debuggers +2021-06-06 04:10:07 d_bot> syscalls and suffering™️ +2021-06-06 04:10:36 d_bot> I'm under the impression that, if you can execute from RAM, you can at least single-step on pretty much any CPU +2021-06-06 04:11:58 d_bot> yeah there's architectural single step stuff provided by most systems; *nix has PTRACE_SINGLESTEP +2021-06-06 04:12:02 d_bot> If you want to single-step the instruction at a given address, then you'd write some kind of "breakpoint" opcode (or, crudely, even just an absolute jump) directly following it, but you'd have to know the length of the opcode beforehand +2021-06-06 04:12:27 d_bot> But I'd hope consumer CPUs can single-step in silicon by now 😅 +2021-06-06 04:12:28 d_bot> variable length encoding is just one part of suffering in writing x86(_64) tooling, yes +2021-06-06 04:12:42 d_bot> Oh yeah I guess debugging has to be infinitely easier on a fixed-length RISC +2021-06-06 04:13:14 d_bot> Imagine if x86 had an instruction that only decoded the length of an instruction at a given address +2021-06-06 04:13:18 d_bot> I suppose there's other challenges, given the domain where RISC microprocessors are probably most prevalently being debugged +2021-06-06 04:13:39 d_bot> who knows, they might, Intel has a ton of hidden instructions and their manual doesn't even document some of them accurately +2021-06-06 04:13:46 d_bot> You're right, there probably is. +2021-06-06 04:14:06 d_bot> There's tons of hardware support for debugging +2021-06-06 04:14:09 d_bot> it's common for trampoline hooking code to come with a "variable length decoder" as a form of minimal disassembler +2021-06-06 04:14:13 d_bot> Watch registers and that kind of thing +2021-06-06 04:14:26 d_bot> Pretty complicated from what I understand +2021-06-06 04:14:27 d_bot> to know how many bytes to replace w/ their placed `jmp` or `push ...; ret` etc. +2021-06-06 04:16:26 d_bot> but yeah, can't lie +2021-06-06 04:16:34 d_bot> confused how we went from stack langs to all this +2021-06-06 04:16:58 d_bot> what is your ambition, aotmr, to write a forth interpreter/compiler? +2021-06-06 04:19:34 d_bot> Just to do it, I guess. I think it's interesting to build a software stack nearly from the bottom up--or nearly so +2021-06-06 04:19:53 d_bot> what, in Forth? +2021-06-06 04:20:04 d_bot> I mean, build a Forth itself from the bottom up +2021-06-06 04:20:14 d_bot> oh alright +2021-06-06 04:20:29 d_bot> In theory it can even be possible to replace the Ocaml parts with Forth themselves +2021-06-06 04:21:15 d_bot> Though "bootstrapping" +2021-06-06 04:21:47 d_bot> First, I'd write a forth compiler in ocaml +2021-06-06 04:22:07 d_bot> Then, translate the compiler to forth +2021-06-06 04:22:17 d_bot> Compile the compiler-in-forth with the compiler-in-ocaml +2021-06-06 04:22:30 d_bot> And then I have a forth compiler, compiled and written in forth +2021-06-06 04:22:36 d_bot> can graduate to something hacky like JITing the FORTH then using C FFI to map the code and somehow return opaque caml values back to the user as callables within OCaml +2021-06-06 04:22:55 d_bot> galaxy brain interplay +2021-06-06 04:23:14 d_bot> That sounds terrifying +2021-06-06 04:23:22 d_bot> -ly based +2021-06-06 04:23:28 d_bot> You got it +2021-06-06 04:23:44 d_bot> don't actually know if you can do that +2021-06-06 04:23:52 d_bot> on the conceptual level, you certainly can with enough hacks +2021-06-06 04:24:21 d_bot> Probably the easiest way to "JIT" stack code is just to apply peephole optimization +2021-06-06 04:24:34 d_bot> can't lie, I hate stacks +2021-06-06 04:24:56 d_bot> The compiler writer writes manual superwords that implement a series of smaller words in a faster way +2021-06-06 04:26:26 d_bot> For example, replacing `>r + r>` with the much shorter machine code for the equivalent sequence that just adds the top element of the stack to the third +2021-06-06 04:42:07 d_bot> I’m just marinating in all these high level ideas +2021-06-06 05:58:42 ralu> I am trying to build infer, but I keep getting error about failed dune build. So i can not build dune. Has anyone has any pointers? +2021-06-06 09:38:22 d_bot> What if we make it so a proper null pointer inside an array means the end of GC scanning? +2021-06-06 10:32:24 d_bot> @Bluddy that's not compatible with a bunch of much more interesting representations improvements (like democratizing the Zarith hack, for instance) +2021-06-06 10:52:39 d_bot> Would lightweight higher kinded types be added to ocaml in the future? +2021-06-06 10:58:32 d_bot> With modular immlicits I guess that lightweight higher kinded types will be less useful +2021-06-06 11:08:02 d_bot> There's a nice chapter on that idea in CPDT: +2021-06-06 11:08:04 d_bot> @Drup could you explain the 'zarith hack'? +2021-06-06 11:09:03 d_bot> But they are still years away as I understand? +2021-06-06 11:09:35 d_bot> @Bluddy A value of type `Z.t` in zarith is either a normal ocaml integer (63bits usually, etc) or a GMP "big integers" +2021-06-06 11:11:56 d_bot> This is achieved by considering the type morally as `int | Big of gmp`. OCaml integers already have a bit put aside for the GC to differentiate them from pointers, so we don't need an extra tag to differentiate between small integers and pointers to a big integer. +2021-06-06 11:12:15 d_bot> This is only possible by going through the C FFI +2021-06-06 11:12:29 d_bot> Machine zero isn't an `int` or a block though +2021-06-06 11:15:09 d_bot> @ggole I can never remember if the tag for integers is 0 or 1. +2021-06-06 11:17:58 d_bot> It's 1 +2021-06-06 11:18:24 d_bot> But even if it were zero, you could set aside a non-valid pointer value to indicate a truncation spot +2021-06-06 11:20:59 d_bot> right, I'm not sure how much I like it, but it could work +2021-06-06 11:26:30 d_bot> I guess there would have to be an `Array.unsafe_set_terminator` or something, which would be a bit nasty +2021-06-06 11:26:41 d_bot> And I dunno what the interaction with bounds checking would be +2021-06-06 11:27:07 d_bot> I suspect they would be more trouble than the terminator value itself though +2021-06-06 11:49:23 d_bot> I need to try it out and see the performance difference. +2021-06-06 11:51:38 d_bot> it's not automatically clear that setting all the memory is a bad idea +2021-06-06 13:00:48 companion_cube> I'd just like to point out that no one else uses a terminator for vectors, afaik +2021-06-06 13:00:55 companion_cube> it seems like a pretty bad idea :p +2021-06-06 13:05:10 d_bot> Most of the other langs with vectors can handle uninitialised memory or keep the bits there without leaks +2021-06-06 13:06:34 companion_cube> and again, it's not that common +2021-06-06 13:06:57 companion_cube> languages that compile to native and have a GC and don't rely on C to implement a ton of datastructures are not plenty +2021-06-06 13:47:15 d_bot> I'm still not entirely used to building data structures in any language *but* C, to be honest--it feels strange +2021-06-06 13:47:52 d_bot> I probably just don't have practice because C is the only language that I use that doesn't have a dynamic array, really +2021-06-06 13:48:49 companion_cube> well OCaml is excellent for implementing a lot of data structures +2021-06-06 13:49:01 companion_cube> vectors just happen to be a bit on the low-level, unsafe memory thingie side +2021-06-06 13:51:37 d_bot> What's a good way to map from a discriminated union to successive integers? +2021-06-06 13:51:43 d_bot> And the other way around? +2021-06-06 13:53:53 companion_cube> ppx_deriving.enum maybe? +2021-06-06 13:54:02 companion_cube> if it's an enum, without payload on the variants, that is. +2021-06-06 13:57:14 d_bot> Hmm +2021-06-06 13:57:14 d_bot> Here's a simpler question: how do I get the "tag" of a sum type? +2021-06-06 13:57:41 companion_cube> you don't :) +2021-06-06 13:57:45 d_bot> I figure I can quickly map integers to most of the opcodes and then manually handle opcodes with a payload +2021-06-06 13:57:47 companion_cube> it's not really specified in the language. +2021-06-06 13:57:48 d_bot> Oh... +2021-06-06 13:57:52 d_bot> The simpler and most forward-compatible way is to write the function. +2021-06-06 13:58:11 d_bot> True, but then I'd have to write two functions and keep them in sync manually, or generate the code. +2021-06-06 13:58:35 d_bot> *sigh* Okay then +2021-06-06 13:58:37 companion_cube> the function from integers to variants seems impossible to write +2021-06-06 13:58:45 companion_cube> if they have payloads that is +2021-06-06 13:58:56 d_bot> I'd be converting from a packed representation +2021-06-06 14:01:41 companion_cube> your best chance is codegen indeed +2021-06-06 14:01:53 companion_cube> variant to int: generate a pattern matching function +2021-06-06 14:02:10 companion_cube> int+payload to variant: well, match on the int I guess +2021-06-06 14:04:58 d_bot> Actually wait, I'm wrong +2021-06-06 14:04:58 d_bot> I shouldn't have written the VM with a discriminated union like this anyways +2021-06-06 14:05:13 d_bot> But, I guess I might as well keep a separate encoded and decoded form +2021-06-06 14:10:07 companion_cube> a VM seems like a good use case for C or C++ or rust, ironically +2021-06-06 14:23:33 d_bot> Oh it's definitely more appropriate, but I'm actually making some headway +2021-06-06 14:24:11 d_bot> I haven't played with ocaml in quite some time (OS issues--it didn't work well on Windows for me until quite recently) +2021-06-06 14:24:23 companion_cube> glad to hear it works better now +2021-06-06 14:24:45 d_bot> I mean, it works better now because it's running in WSL 😆 +2021-06-06 14:25:44 d_bot> So I'm happy that I remember how to build list to list mappings that produce and consume varying numbers of elements +2021-06-06 15:08:24 d_bot> Cool, so I've figured out how to build an encoder and decoder for a variable-length instruction stream +2021-06-06 18:00:25 kluk> I get "Error: Unbound module Batteries" after doing open Batteries;; on the ocaml repl after having done opam install batteries. what am I missing? +2021-06-06 18:04:03 companion_cube> #require "batteries";; +2021-06-06 18:04:12 companion_cube> (and possibly, before that, #use "topfind";;) +2021-06-06 18:07:13 kluk> Ahhh.. it wasn't clear to me that #use was needed to bring #require but now that I ran it I can see in its blurb that it does do that. Thank you very much. +2021-06-06 18:07:49 companion_cube> also note that if you use `utop` it does the topfind thing directly +2021-06-06 18:08:03 companion_cube> you can also put the blurb in ~/.ocamlinit +2021-06-06 18:11:31 kluk> companion_cube thank you for the .ocamlinit tip +2021-06-06 18:27:10 kluk> companion_cube so now I can use DynArray from Batteries just fine :) thanks so much for the help once again. +2021-06-06 18:35:30 companion_cube> heh From 1450b869f9c56235ca0b44ef57f717b196a58e62 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sun, 6 Jun 2021 23:23:46 -0400 Subject: [PATCH 16/24] try to fix ci --- examples/dune | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/dune b/examples/dune index e2a6a711..0be3cc1e 100644 --- a/examples/dune +++ b/examples/dune @@ -10,6 +10,7 @@ (alias (name runtest) (locks /ctest) + (deps (source_tree test_data)) (action (ignore-stdout (run ./id_sexp.exe test_data/benchpress.sexp)))) @@ -17,6 +18,7 @@ (alias (name runtest) (locks /ctest) + (deps (source_tree test_data)) (action (ignore-stdout (run ./ccparse_sexp.exe test_data/benchpress.sexp)))) @@ -24,6 +26,7 @@ (alias (name runtest) (locks /ctest) + (deps (source_tree test_data)) (action (ignore-stdout (run ./ccparse_irclogs.exe test_data/irc-logs.txt)))) From 5c67fb51ab1389f07eebbee23da35824199438be Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Mon, 7 Jun 2021 00:10:20 -0400 Subject: [PATCH 17/24] try to fix ci --- examples/dune | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/dune b/examples/dune index 0be3cc1e..2b0405bc 100644 --- a/examples/dune +++ b/examples/dune @@ -27,6 +27,7 @@ (name runtest) (locks /ctest) (deps (source_tree test_data)) + (enabled_if (>= %{ocaml_version} "4.08")) (action (ignore-stdout (run ./ccparse_irclogs.exe test_data/irc-logs.txt)))) From b9828375e1997f3747890ce524c33521e549b08e Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Mon, 7 Jun 2021 00:24:39 -0400 Subject: [PATCH 18/24] use dune 1.4 and build ccparse_irclogs only on >= 4.08 --- containers-data.opam | 2 +- containers-thread.opam | 2 +- containers.opam | 2 +- dune-project | 2 +- ...se_irclogs.ml => ccparse_irclogs_real.cond.ml} | 0 examples/dune | 15 +++++++++++---- 6 files changed, 15 insertions(+), 8 deletions(-) rename examples/{ccparse_irclogs.ml => ccparse_irclogs_real.cond.ml} (100%) diff --git a/containers-data.opam b/containers-data.opam index 281a00e6..f50e954d 100644 --- a/containers-data.opam +++ b/containers-data.opam @@ -11,7 +11,7 @@ build: [ ] depends: [ "ocaml" { >= "4.03.0" } - "dune" { >= "1.1" } + "dune" { >= "1.4" } "containers" { = version } "seq" "qtest" { with-test } diff --git a/containers-thread.opam b/containers-thread.opam index d0d4baab..a2d3dd0f 100644 --- a/containers-thread.opam +++ b/containers-thread.opam @@ -11,7 +11,7 @@ build: [ ] depends: [ "ocaml" { >= "4.03.0" } - "dune" { >= "1.1" } + "dune" { >= "1.4" } "base-threads" "dune-configurator" "containers" { = version } diff --git a/containers.opam b/containers.opam index 14002208..7da8de9a 100644 --- a/containers.opam +++ b/containers.opam @@ -12,7 +12,7 @@ build: [ ] depends: [ "ocaml" { >= "4.03.0" } - "dune" { >= "1.1" } + "dune" { >= "1.4" } "dune-configurator" "seq" "qtest" { with-test } diff --git a/dune-project b/dune-project index 7655de07..f9337290 100644 --- a/dune-project +++ b/dune-project @@ -1 +1 @@ -(lang dune 1.1) +(lang dune 1.4) diff --git a/examples/ccparse_irclogs.ml b/examples/ccparse_irclogs_real.cond.ml similarity index 100% rename from examples/ccparse_irclogs.ml rename to examples/ccparse_irclogs_real.cond.ml diff --git a/examples/dune b/examples/dune index 2b0405bc..13b6e613 100644 --- a/examples/dune +++ b/examples/dune @@ -2,10 +2,7 @@ (executables (names id_sexp ccparse_sexp ccparse_irclogs) (libraries containers) - ;(modules id_sexp) - (flags :standard -warn-error -a+8 -safe-string -color always) - (ocamlopt_flags :standard -O3 -color always - -unbox-closures -unbox-closures-factor 20)) + (flags :standard -warn-error -a+8)) (alias (name runtest) @@ -23,6 +20,16 @@ (ignore-stdout (run ./ccparse_sexp.exe test_data/benchpress.sexp)))) +(rule + (targets ccparse_irclogs.ml) + (enabled_if (>= %{ocaml_version} "4.08")) + (action (copy ccparse_irclogs_real.cond.ml %{targets}))) + +(rule + (targets ccparse_irclogs.ml) + (enabled_if (< %{ocaml_version} "4.08")) + (action (with-stdout-to %{targets} (run echo "let() = print_endline {|ok|}")))) + (alias (name runtest) (locks /ctest) From 8f9ecf5f41be1c798e7e2ab339a8296215be7308 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Fri, 24 Sep 2021 22:45:17 -0400 Subject: [PATCH 19/24] some fixes related to review --- src/core/CCParse.ml | 4 ++-- src/core/CCParse.mli | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index d83b0f35..559870af 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -531,8 +531,8 @@ let chars_fold ~f acc0 = { | `Continue acc' -> incr i; acc := acc' - | `Stop -> continue := false; - | `Consume_and_stop -> incr i; continue := false + | `Stop a -> acc := a; continue := false; + | `Consume_and_stop a -> acc := a; incr i; continue := false | `Fail msg -> raise (Fold_fail (st,msg)) ) done; diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 1238a924..6b575652 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -151,11 +151,8 @@ val ap : ('a -> 'b) t -> 'a t -> 'b t val eoi : unit t (** Expect the end of input, fails otherwise. *) -val nop : unit t -(** Succeed with [()]. *) - val empty : unit t -(** Succeed with [()], same as {!nop}. +(** Succeed with [()]. @since NEXT_RELEASE *) val fail : string -> 'a t @@ -256,16 +253,16 @@ val set_current_slice : slice -> unit t val chars_fold : f:('acc -> char -> - [`Continue of 'acc | `Consume_and_stop | `Stop | `Fail of string]) -> + [`Continue of 'acc | `Consume_and_stop of 'acc | `Stop of 'acc | `Fail of string]) -> 'acc -> ('acc * slice) t (** [chars_fold f acc0] folds over characters of the input. Each char [c] is passed, along with the current accumulator, to [f]; [f] can either: - - stop, by returning [`Stop]. In this case the current accumulator + - stop, by returning [`Stop acc]. In this case the final accumulator [acc] is returned, and [c] is not consumed. - - consume char and stop, by returning [`Consume_and_stop]. + - consume char and stop, by returning [`Consume_and_stop acc]. - fail, by returning [`Fail msg]. In this case the parser fails with the given message. - continue, by returning [`Continue acc]. The parser continues to the From 396a7db96708b97b4dc06136a355b2b3ebbd185a Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Fri, 24 Sep 2021 22:58:08 -0400 Subject: [PATCH 20/24] more fixes --- src/core/CCOrd.mli | 2 +- src/core/CCParse.mli | 46 ++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/src/core/CCOrd.mli b/src/core/CCOrd.mli index e2f9454d..11c816ba 100644 --- a/src/core/CCOrd.mli +++ b/src/core/CCOrd.mli @@ -19,7 +19,7 @@ val compare : 'a t generally not very descriptive. *) val opp : 'a t -> 'a t -(** Opposite order. For example, [opp a b < 0] iff [opp b a > 0]. +(** Opposite order. For example, [opp cmp a b < 0] iff [cmp b a > 0]. This can be used to sort values in the opposite order, among other things. *) val equiv : int -> int -> bool diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 6b575652..fe9b0b9c 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -200,7 +200,7 @@ val char : char -> char t type slice (** A slice of the input, as returned by some combinators such - as {!split_1} or {split_list} or {!take}. + as {!split_1} or {!split_list} or {!take}. The idea is that one can use some parsers to cut the input into slices, e.g. split into lines, or split a line into fields (think CSV or TSV). @@ -272,6 +272,7 @@ val chars_fold : characters on the fly, skip some, handle escape sequences, etc. It can also be useful as a base component for a lexer. + @return a pair of the final accumular, and the slice matched by the fold. @since NEXT_RELEASE *) val chars_fold_transduce : @@ -291,7 +292,7 @@ val chars_fold_transduce : @since NEXT_RELEASE *) val take : int -> slice t -(** [slice_of_len len] parses exactly [len] characters from the input. +(** [take len] parses exactly [len] characters from the input. Fails if the input doesn't contain at least [len] chars. @since NEXT_RELEASE *) @@ -302,21 +303,22 @@ val take_if : (char -> bool) -> slice t val take1_if : ?descr:string -> (char -> bool) -> slice t (** [take1_if f] takes characters as long as they satisfy the predicate [f]. Fails if no character satisfies [f]. + @param descr describes what kind of character was expected, in case of error @since NEXT_RELEASE *) val char_if : ?descr:string -> (char -> bool) -> char t (** [char_if f] parses a character [c] if [f c = true]. Fails if the next char does not satisfy [f]. - @param descr describes what kind of character was expected *) + @param descr describes what kind of character was expected, in case of error *) val chars_if : (char -> bool) -> string t (** [chars_if f] parses a string of chars that satisfy [f]. Cannot fail. *) val chars1_if : ?descr:string -> (char -> bool) -> string t -(** Like {!chars_if}, but only non-empty strings. - Fails if the string is empty. - @param descr describes what kind of character was expected *) +(** Like {!chars_if}, but accepts only non-empty strings. + [chars1_if p] fails if the string accepted by [chars_if p] is empty. + @param descr describes what kind of character was expected, in case of error *) val endline : char t (** Parse '\n'. *) @@ -440,7 +442,9 @@ val many1 : 'a t -> 'a list t list is empty (i.e. it needs [p] to succeed at least once). *) val skip : _ t -> unit t -(** [skip p] parses zero or more times [p] and ignores its result. *) +(** [skip p] parses zero or more times [p] and ignores its result. + It is eager, meaning it will continue as long as [p] succeeds. + As soon as [p] fails, [skip p] stops consuming any input. *) val sep : by:_ t -> 'a t -> 'a list t (** [sep ~by p] parses a list of [p] separated by [by]. *) @@ -462,7 +466,7 @@ val lookahead_ignore : 'a t -> unit t (** [lookahead_ignore p] tries to parse input with [p], and succeeds if [p] succeeds. However it doesn't consume any input and returns [()], so in effect its only use-case is to detect - whether [p] succeeds, e.g. in {!cond}. + whether [p] succeeds, e.g. in {!try_or_l}. {b EXPERIMENTAL} @since NEXT_RELEASE *) @@ -477,7 +481,7 @@ val line : slice t val line_str : string t (** [line_str] is [line >|= Slice.to_string]. It parses the next line and turns the slice into a string. - The state points to after the ['\n'] character. + The state points to the character immediately after the ['\n'] character. @since NEXT_RELEASE *) val each_line : 'a t -> 'a list t @@ -491,8 +495,10 @@ val split_1 : on_char:char -> (slice * slice option) t - [sl1] is the slice of the input the precedes the first occurrence of [on_char], or the whole input if [on_char] cannot be found. + It does not contain [on_char]. - [sl2] is the slice that comes after [on_char], - or [None] if [on_char] couldn't be found. + or [None] if [on_char] couldn't be found. It doesn't contain the first + occurrence of [on_char] (if any). The parser is now positioned at the end of the input. @@ -500,12 +506,9 @@ val split_1 : on_char:char -> (slice * slice option) t @since NEXT_RELEASE *) val split_list : on_char:char -> slice list t -(** [split_n ~on_char] splits the input on all occurrences of [on_char], +(** [split_list ~on_char] splits the input on all occurrences of [on_char], returning a list of slices. - A useful specialization of this is {!each_line}, which is - basically [split_n ~on_char:'\n' p]. - {b EXPERIMENTAL} @since NEXT_RELEASE *) @@ -544,6 +547,9 @@ val each_split : on_char:char -> 'a t -> 'a list t While it is more flexible, this technique also means [p] has to be careful not to consume [on_char] by error. + A useful specialization of this is {!each_line}, which is + basically [each_split ~on_char:'\n' p]. + {b EXPERIMENTAL} @since NEXT_RELEASE *) @@ -558,7 +564,7 @@ val all : slice t val all_str : string t (** [all_str] accepts all the remaining chars and extracts them into a - string. Similar to {!rest_of_input} but with a string. + string. Similar to {!all} but with a string. {b EXPERIMENTAL} @since NEXT_RELEASE *) @@ -625,7 +631,7 @@ module Infix : sig val () : 'a t -> string -> 'a t (** [a msg] behaves like [a], but if [a] fails, - [a msg] fails with [msg] instead. Useful as the last choice in a series of [<|>]. For example: [a <|> b <|> c "expected one of a, b, c"]. *) @@ -675,7 +681,7 @@ val parse_file_exn : 'a t -> string -> 'a module U : sig val list : ?start:string -> ?stop:string -> ?sep:string -> 'a t -> 'a list t (** [list p] parses a list of [p], with the OCaml conventions for - start token "[", stop token "]" and separator ";". + start token "\[", stop token "\]" and separator ";". Whitespace between items are skipped. *) (* TODO: parse option? *) @@ -707,7 +713,8 @@ module U : sig (** Non empty string of alpha num, start with alpha. *) val bool : bool t - (** Accepts "true" or "false" *) + (** Accepts "true" or "false" + @since NEXT_RELEASE *) (* TODO: quoted string *) @@ -723,7 +730,8 @@ module U : sig end (** Debugging utils. - {b EXPERIMENTAL} *) + {b EXPERIMENTAL} + @since NEXT_RELEASE *) module Debug_ : sig val trace_fail : string -> 'a t -> 'a t (** [trace_fail name p] behaves like [p], but prints the error message of [p] From e06cd516f078ef4888e41a9226359c4236c96cae Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 25 Sep 2021 15:38:35 -0400 Subject: [PATCH 21/24] detail --- src/core/CCParse.mli | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index fe9b0b9c..94c5875a 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -384,8 +384,9 @@ val try_ : 'a t -> 'a t to detect failure. *) val try_opt : 'a t -> 'a option t -(** [try_ p] tries to parse using [p], and return [Some x] if [p] - succeeded with [x]. Otherwise it returns [None]. This cannot fail. +(** [try_opt p] tries to parse using [p], and return [Some x] if [p] + succeeded with [x] (and consumes what [p] consumed). + Otherwise it returns [None] and consumes nothing. This cannot fail. @since NEXT_RELEASE *) val many_until : until:_ t -> 'a t -> 'a list t From b8fa4004653aec6807a097afdf7071574db55801 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 25 Sep 2021 20:47:26 -0400 Subject: [PATCH 22/24] a test to ensure `chars1_if p = take1_if p >|= Slice.to_string` --- src/core/CCParse.ml | 13 +++++++++++++ src/core/CCParse.mli | 1 + 2 files changed, 14 insertions(+) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index 559870af..d3d55f55 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -512,6 +512,19 @@ let chars1_if ?descr p = { ~err } +(*$QR + Q.(printable_string) (fun s -> + let pred = (function 'a'..'z' | 'A' .. 'Z' | '{' | '}' -> true | _ -> false) in + let p1 = chars1_if pred in + let p2 = take1_if pred >|= Slice.to_string in + parse_string p1 s = parse_string p2 s) + *) + +(*$T + let pred = (function 'a'..'z' | 'A' .. 'Z' | '{' | '}' -> true | _ -> false) in \ + parse_string (chars_if pred) "coucou{lol} 123" = Ok "coucou{lol}" +*) + exception Fold_fail of state * string let chars_fold ~f acc0 = { diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 94c5875a..1bcdd4b7 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -318,6 +318,7 @@ val chars_if : (char -> bool) -> string t val chars1_if : ?descr:string -> (char -> bool) -> string t (** Like {!chars_if}, but accepts only non-empty strings. [chars1_if p] fails if the string accepted by [chars_if p] is empty. + [chars1_if p] is equivalent to [take1_if p >|= Slice.to_string]. @param descr describes what kind of character was expected, in case of error *) val endline : char t From 938c7cb90abddb36f31e042c3e8501485c03f0d9 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 25 Sep 2021 20:50:04 -0400 Subject: [PATCH 23/24] more doc --- src/core/CCParse.mli | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 1bcdd4b7..78f01389 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -394,7 +394,8 @@ val many_until : until:_ t -> 'a t -> 'a list t (** [many_until ~until p] parses as many [p] as it can until the [until] parser successfully returns. If [p] fails before that then [many_until ~until p] fails as well. - Typically [until] can be a closing ')' or another termination condition. + Typically [until] can be a closing ')' or another termination condition, + and what is consumed by [until] is also consumed by [many_until ~until p]. {b EXPERIMENTAL} From 9c7279751514d90fbdee5cc86b5a26752fe5cf3c Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 25 Sep 2021 20:51:18 -0400 Subject: [PATCH 24/24] minor changes --- src/core/CCParse.ml | 4 ++-- src/core/CCParse.mli | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/CCParse.ml b/src/core/CCParse.ml index d3d55f55..465e9f76 100644 --- a/src/core/CCParse.ml +++ b/src/core/CCParse.ml @@ -657,12 +657,12 @@ let take len : slice t = { ) } -let any_chars len : _ t = take len >|= Slice.to_string +let any_char_n len : _ t = take len >|= Slice.to_string let exact s = { run=fun st ~ok ~err -> (* parse a string of length [String.length s] and compare with [s] *) - (any_chars (String.length s)).run st + (any_char_n (String.length s)).run st ~ok:(fun st s2 -> if string_equal s s2 then ok st s else ( diff --git a/src/core/CCParse.mli b/src/core/CCParse.mli index 78f01389..274d4852 100644 --- a/src/core/CCParse.mli +++ b/src/core/CCParse.mli @@ -190,8 +190,8 @@ val any_char : char t It still fails if the end of input was reached. @since NEXT_RELEASE *) -val any_chars : int -> string t -(** [any_chars len] parses exactly [len] characters from the input. +val any_char_n : int -> string t +(** [any_char_n len] parses exactly [len] characters from the input. Fails if the input doesn't contain at least [len] chars. @since NEXT_RELEASE *)