diff --git a/src/core/CCSexp.ml b/src/core/CCSexp.ml index 1c75df73..9a2b69b8 100644 --- a/src/core/CCSexp.ml +++ b/src/core/CCSexp.ml @@ -8,12 +8,14 @@ type 'a or_error = ('a, string) result type 'a gen = unit -> 'a option module type SEXP = CCSexp_intf.SEXP +module type BASIC_SEXP = CCSexp_intf.BASIC_SEXP module type S = CCSexp_intf.S +module type S0 = CCSexp_intf.S0 let equal_string (a : string) b = Stdlib.(=) a b let compare_string (a : string) b = Stdlib.compare a b -module Make(Sexp : SEXP) = struct +module MakeBasic(Sexp : BASIC_SEXP) = struct type t = Sexp.t type sexp = t @@ -33,18 +35,32 @@ module Make(Sexp : SEXP) = struct let of_field name t = Sexp.list [Sexp.atom name; t] let of_record l = Sexp.list (List.map (fun (n,x) -> of_field n x) l) +end - (** {2 Printing} *) +let _with_in filename f = + let ic = open_in filename in + try + let x = f ic in + close_in ic; + x + with e -> + close_in ic; + Error (Printexc.to_string e) - let _with_out filename f = - let oc = open_out filename in - try - let x = f oc in - close_out oc; - x - with e -> - close_out oc; - raise e +let _with_out filename f = + let oc = open_out filename in + try + let x = f oc in + close_out oc; + x + with e -> + close_out oc; + raise e + +module Make(Sexp : SEXP) = struct + include MakeBasic(Sexp) + + (** {3 Printing} *) (* shall we escape the string because of one of its chars? *) let _must_escape s = @@ -127,16 +143,6 @@ module Make(Sexp : SEXP) = struct (** {2 Parsing} *) - let _with_in filename f = - let ic = open_in filename in - try - let x = f ic in - close_in ic; - x - with e -> - close_in ic; - Error (Printexc.to_string e) - (** A parser of ['a] can return [Yield x] when it parsed a value, or [Fail e] when a parse error was encountered, or [End] if the input was empty *) @@ -190,7 +196,7 @@ module Make(Sexp : SEXP) = struct let _u = expr() in (* discard next sexp *) expr() | L.ATOM s -> - junk t; + junk t; begin match Sexp.make_loc with | None -> Sexp.atom s | Some f -> @@ -213,7 +219,7 @@ module Make(Sexp : SEXP) = struct let loc = f (pair_of_pos_ pos_start) (pair_of_pos_ t.buf.lex_curr_p) - t.buf.lex_curr_p.pos_fname in + t.buf.lex_curr_p.pos_fname in Sexp.list_with_loc ~loc l end | _ -> error_ t.buf "expected ')'" @@ -293,6 +299,189 @@ module Make(Sexp : SEXP) = struct let parse_file_list filename = _with_in filename (parse_chan_list_ ~file:filename) end +module MakeCanonical(Sexp : BASIC_SEXP) = struct + include MakeBasic(Sexp) + + (** {3 Printing} *) + + let rec to_buf b t = + Sexp.match_ t + ~atom:(fun s -> Printf.bprintf b "%d:%s" (String.length s) s) + ~list:(function + | [] -> Buffer.add_string b "()" + | [x] -> Printf.bprintf b "(%a)" to_buf x + | l -> + Buffer.add_char b '('; + List.iter (to_buf b) l; + Buffer.add_char b ')') + + let to_string t = + let b = Buffer.create 128 in + to_buf b t; + Buffer.contents b + + let rec pp_noindent fmt t = + Sexp.match_ t + ~atom:(fun s -> Format.fprintf fmt "%d:%s" (String.length s) s) + ~list:(function + | [] -> Format.pp_print_string fmt "()" + | [x] -> Format.fprintf fmt "(%a)" pp_noindent x + | l -> + Format.fprintf fmt "("; + List.iter (pp_noindent fmt) l; + Format.fprintf fmt ")") + + let pp = pp_noindent + + let rec to_chan oc t = + Sexp.match_ t + ~atom:(fun s -> Printf.fprintf oc "%d:%s" (String.length s) s) + ~list:(function + | [] -> output_string oc "()" + | [x] -> Printf.fprintf oc "(%a)" to_chan x + | l -> + output_char oc '('; + List.iter (to_chan oc) l; + output_char oc ')') + + let to_file_iter filename iter = + _with_out filename + (fun oc -> iter (fun t -> to_chan oc t)) + + let to_file filename t = to_file_iter filename (fun k -> k t) + + (** {3 Parsing} *) + + module type INPUT = sig + exception EOF + val read_char : unit -> char + val read_string : int -> string + end + + module Decoder(I:INPUT) = struct + let[@inline] is_num_ c = Char.code c >= Char.code '0' && Char.code c <= Char.code '9' + let[@inline] as_num_ c = Char.code c - Char.code '0' + + let next_ () : sexp or_error * bool = + let rec read_string_len n = + match I.read_char () with + | c when is_num_ c -> read_string_len (n * 10 + as_num_ c) + | ':' -> + let s = I.read_string n in + atom s + | _ -> failwith "expected string length" + + and eat_colon () = + match I.read_char () with + | ':' -> () + | _ -> failwith "expected ':'" + + and read_in_paren acc = + match I.read_char () with + | ')' -> list (List.rev acc) + | c when is_num_ c -> + let sexp = read_string_len (as_num_ c) in + read_in_paren (sexp::acc) + | '(' -> + let sexp = read_in_paren [] in + read_in_paren (sexp::acc) + | _ -> failwith "expected list of sexprs" + in + (* read a S-expr *) + try + begin match I.read_char () with + | exception I.EOF -> Error "unexpected EOF", true + | '(' -> Ok (read_in_paren []), false + | '0' -> eat_colon (); Ok (atom ""), false + | c when is_num_ c -> Ok (read_string_len (as_num_ c)), false + | _ -> Error "unexpected char, expected toplevel sexpr", false + end + with Failure e -> Error e, false + + let to_list () : _ or_error = + let rec iter acc = + match next_ () with + | Error _, true -> Ok (List.rev acc) + | Ok x, _ -> iter (x::acc) + | Error _ as res, _ -> res + in + try iter [] + with e -> Error (Printexc.to_string e) + + let[@inline] next_or_error () : _ or_error = fst (next_ ()) + end[@@inline] + + module Decoder_str(X : sig val s : string end) = + Decoder(struct + exception EOF + let i = ref 0 + let n = String.length X.s + let read_char () = + if !i >= n then raise_notrace EOF; + let c = String.unsafe_get X.s !i in + incr i; + c + let read_string len = + if !i + len > n then raise_notrace EOF; + let res = String.sub X.s !i len in + i := !i + len; + res + end)[@@inline] + + let parse_string s : t or_error = + let module D = Decoder_str(struct let s=s end) in + D.next_or_error () + + let parse_string_list s : t list or_error = + let module D = Decoder_str(struct let s=s end) in + D.to_list () + + module Decoder_ic(X : sig val ic : in_channel end) = + Decoder(struct + exception EOF = End_of_file + let[@inline] read_char () = input_char X.ic + let read_string n = + match n with + | 0 -> "" + | 1 -> String.make 1 (read_char ()) + | _ -> + let buf = Bytes.make n '\000' in + let i = ref 0 in + while !i < n do + let len = input X.ic buf !i (n - !i) in + i := !i + len; + done; + Bytes.unsafe_to_string buf + end)[@@inline] + + let parse_chan_ ?file ic : sexp or_error = + let module D = Decoder_ic(struct let ic=ic end) in + match D.next_or_error(), file with + | Error s, Some file -> Error (Printf.sprintf "%s in '%s'" s file) + | r, _ -> r + + let parse_chan_list_ ?file ic = + let module D = Decoder_ic(struct let ic=ic end) in + match D.to_list (), file with + | Error s, Some file -> Error (Printf.sprintf "%s in '%s'" s file) + | r, _ -> r + + let parse_chan ic = parse_chan_ ic + let parse_chan_list ic = parse_chan_list_ ic + + let parse_chan_gen ic = + let module D = Decoder_ic(struct let ic=ic end) in + fun () -> + match D.next_ () with + | _, true -> None + | Error e, _ -> Some (Error e) + | Ok x, _ -> Some (Ok x) + + let parse_file filename = _with_in filename (parse_chan_ ~file:filename) + + let parse_file_list filename = _with_in filename (parse_chan_list_ ~file:filename) +end + type t = [ | `Atom of string | `List of t list @@ -321,20 +510,22 @@ and compare a b = match a, b with | `Atom _, _ -> -1 | `List _, _ -> 1 -include (Make(struct - type t_ = t - type t = t_ - type loc = unit - let make_loc = None - let atom x = `Atom x - let list x = `List x - let atom_with_loc ~loc:_ s = atom s - let list_with_loc ~loc:_ l = list l +module Basic_ = struct + type nonrec t = t + type loc = unit + let make_loc = None + let atom x = `Atom x + let list x = `List x + let atom_with_loc ~loc:_ s = atom s + let list_with_loc ~loc:_ l = list l - let match_ x ~atom ~list = match x with - | `Atom x -> atom x - | `List l -> list l - end) : S with type t := t) + let match_ x ~atom ~list = match x with + | `Atom x -> atom x + | `List l -> list l +end + +include (Make(Basic_) : S with type t := t) +module Canonical = MakeCanonical(Basic_) (*$T CCResult.to_opt (parse_string "(abc d/e/f \"hello \\\" () world\" )") <> None @@ -361,11 +552,18 @@ include (Make(struct (*$inject let sexp_bijective s = to_string s |> parse_string = Ok s + let csexp_bijective s = Canonical.to_string s |> Canonical.parse_string = Ok s +*) + +(*$= & ~printer:CCFormat.(to_string (Dump.result Canonical.pp)) + (Ok (`List [`Atom ""])) (parse_string "(\"\")") + (Ok (`List [`Atom ""])) (Canonical.parse_string {|(0:)|}) + (Ok (`List [`Atom "a"; `Atom "b "])) (Canonical.parse_string {|(1:a2:b )|}) *) (*$T - (Ok (`List [`Atom ""]) = (parse_string "(\"\")")) sexp_bijective (`List [`Atom ""]) + csexp_bijective (`List [`Atom ""]) *) (*$inject @@ -398,6 +596,17 @@ include (Make(struct (*$Q & ~count:100 sexp_gen sexp_bijective + sexp_gen csexp_bijective +*) + +(*$R + let s1 = + `List (CCList.init 100_000 + (fun i -> `List [`Atom "-"; `Atom (string_of_int i); `Atom ")(\n]"])) in + let str = Canonical.to_string s1 in + match Canonical.parse_string str with + | Ok s2 -> assert_equal s1 s2 + | Error e -> assert_failure e *) let atom s : t = `Atom s diff --git a/src/core/CCSexp.mli b/src/core/CCSexp.mli index 449cd5ce..697e486f 100644 --- a/src/core/CCSexp.mli +++ b/src/core/CCSexp.mli @@ -10,6 +10,11 @@ type 'a gen = unit -> 'a option (** {2 Abstract representation of S-expressions} + @since NEXT_RELEASE *) +module type BASIC_SEXP = CCSexp_intf.BASIC_SEXP + +(** {2 Abstract representation of S-expressions (extended)} + @since 2.7 *) module type SEXP = CCSexp_intf.SEXP @@ -18,6 +23,10 @@ module type SEXP = CCSexp_intf.SEXP @since 2.7 *) module type S = CCSexp_intf.S +(** Subset of {!S} + @since NEXT_RELEASE *) +module type S0 = CCSexp_intf.S0 + (** {2 Functorized operations} This builds a parser and printer for S-expressions represented as @@ -26,6 +35,12 @@ module type S = CCSexp_intf.S @since 2.7 *) module Make(Sexp : SEXP) : S with type t = Sexp.t +(** {2 Parser and printer for Canonical S-exprs} + + See {{: https://en.wikipedia.org/wiki/Canonical_S-expressions} wikipedia}. + These S-expressions are binary safe. *) +module MakeCanonical(Sexp : BASIC_SEXP) : S0 with type t = Sexp.t + (** {2 Basics} *) (** A simple, structural representation of S-expressions. *) @@ -36,6 +51,10 @@ type t = [ include S with type t := t +(** Canonical encoding + @since NEXT_RELEASE *) +module Canonical : S0 with type t = t + val equal : t -> t -> bool (** @since 3.0 *) diff --git a/src/core/CCSexp_intf.ml b/src/core/CCSexp_intf.ml index 6fc61e42..02458a0c 100644 --- a/src/core/CCSexp_intf.ml +++ b/src/core/CCSexp_intf.ml @@ -4,10 +4,22 @@ type 'a iter = ('a -> unit) -> unit type 'a gen = unit -> 'a option (** {2 Abstract representation of S-expressions} + @since NEXT_RELEASE *) +module type BASIC_SEXP = sig + type t + + val atom : string -> t + val list : t list -> t + + val match_ : t -> atom:(string -> 'a) -> list:(t list -> 'a) -> 'a +end + +(** {2 Abstract representation of S-expressions (extended)} @since 2.7 *) module type SEXP = sig - type t + include BASIC_SEXP + type loc val make_loc : ((int * int) -> (int * int) -> string -> loc) option @@ -16,17 +28,12 @@ module type SEXP = sig val atom_with_loc : loc:loc -> string -> t val list_with_loc : loc:loc -> t list -> t - - val atom : string -> t - val list : t list -> t - - val match_ : t -> atom:(string -> 'a) -> list:(t list -> 'a) -> 'a end (** {2 Operations over S-expressions} @since 2.7 *) -module type S = sig +module type S0 = sig type t type sexp = t @@ -86,29 +93,6 @@ module type S = sig (** {2 Parsing} *) - (** A parser of ['a] can return [Yield x] when it parsed a value, - or [Fail e] when a parse error was encountered, or - [End] if the input was empty. *) - type 'a parse_result = - | Yield of 'a - | Fail of string - | End - - module Decoder : sig - type t - (** Decoder *) - - val of_lexbuf : Lexing.lexbuf -> t - - val next : t -> sexp parse_result - (** Parse the next S-expression or return an error if the input isn't - long enough or isn't a proper S-expression. *) - - val to_list : t -> sexp list or_error - (** Read all the values from this decoder. - @since 2.8 *) - end - val parse_string : string -> t or_error (** Parse a string. *) @@ -131,4 +115,37 @@ module type S = sig val parse_file_list : string -> t list or_error (** Open the file and read a S-exp from it. *) + +end + +(** {2 Operations over S-expressions (extended)} + + @since 2.7 *) +module type S = sig + include S0 + + (** {2 Parsing} *) + + (** A parser of ['a] can return [Yield x] when it parsed a value, + or [Fail e] when a parse error was encountered, or + [End] if the input was empty. *) + type 'a parse_result = + | Yield of 'a + | Fail of string + | End + + module Decoder : sig + type t + (** Decoder *) + + val of_lexbuf : Lexing.lexbuf -> t + + val next : t -> sexp parse_result + (** Parse the next S-expression or return an error if the input isn't + long enough or isn't a proper S-expression. *) + + val to_list : t -> sexp list or_error + (** Read all the values from this decoder. + @since 2.8 *) + end end diff --git a/src/core/CCSexp_lex.mll b/src/core/CCSexp_lex.mll index 881338df..76e0ecca 100644 --- a/src/core/CCSexp_lex.mll +++ b/src/core/CCSexp_lex.mll @@ -88,3 +88,4 @@ rule token = parse | string { ATOM (remove_quotes lexbuf (Lexing.lexeme lexbuf)) } | _ as c { error lexbuf (Printf.sprintf "lexing failed on char `%c`" c) } +