feat(ccsexp): printer and parser for canonical S-exprs

This commit is contained in:
Simon Cruanes 2021-02-02 22:16:15 -05:00
parent 0097fd3c3d
commit b2b1d2b5fa
4 changed files with 313 additions and 67 deletions

View file

@ -8,12 +8,14 @@ type 'a or_error = ('a, string) result
type 'a gen = unit -> 'a option
module type SEXP = CCSexp_intf.SEXP
module type BASIC_SEXP = CCSexp_intf.BASIC_SEXP
module type S = CCSexp_intf.S
module type S0 = CCSexp_intf.S0
let equal_string (a : string) b = Stdlib.(=) a b
let compare_string (a : string) b = Stdlib.compare a b
module Make(Sexp : SEXP) = struct
module MakeBasic(Sexp : BASIC_SEXP) = struct
type t = Sexp.t
type sexp = t
@ -33,18 +35,32 @@ module Make(Sexp : SEXP) = struct
let of_field name t = Sexp.list [Sexp.atom name; t]
let of_record l =
Sexp.list (List.map (fun (n,x) -> of_field n x) l)
end
(** {2 Printing} *)
let _with_in filename f =
let ic = open_in filename in
try
let x = f ic in
close_in ic;
x
with e ->
close_in ic;
Error (Printexc.to_string e)
let _with_out filename f =
let oc = open_out filename in
try
let x = f oc in
close_out oc;
x
with e ->
close_out oc;
raise e
let _with_out filename f =
let oc = open_out filename in
try
let x = f oc in
close_out oc;
x
with e ->
close_out oc;
raise e
module Make(Sexp : SEXP) = struct
include MakeBasic(Sexp)
(** {3 Printing} *)
(* shall we escape the string because of one of its chars? *)
let _must_escape s =
@ -127,16 +143,6 @@ module Make(Sexp : SEXP) = struct
(** {2 Parsing} *)
let _with_in filename f =
let ic = open_in filename in
try
let x = f ic in
close_in ic;
x
with e ->
close_in ic;
Error (Printexc.to_string e)
(** A parser of ['a] can return [Yield x] when it parsed a value,
or [Fail e] when a parse error was encountered, or
[End] if the input was empty *)
@ -293,6 +299,189 @@ module Make(Sexp : SEXP) = struct
let parse_file_list filename = _with_in filename (parse_chan_list_ ~file:filename)
end
module MakeCanonical(Sexp : BASIC_SEXP) = struct
include MakeBasic(Sexp)
(** {3 Printing} *)
let rec to_buf b t =
Sexp.match_ t
~atom:(fun s -> Printf.bprintf b "%d:%s" (String.length s) s)
~list:(function
| [] -> Buffer.add_string b "()"
| [x] -> Printf.bprintf b "(%a)" to_buf x
| l ->
Buffer.add_char b '(';
List.iter (to_buf b) l;
Buffer.add_char b ')')
let to_string t =
let b = Buffer.create 128 in
to_buf b t;
Buffer.contents b
let rec pp_noindent fmt t =
Sexp.match_ t
~atom:(fun s -> Format.fprintf fmt "%d:%s" (String.length s) s)
~list:(function
| [] -> Format.pp_print_string fmt "()"
| [x] -> Format.fprintf fmt "(%a)" pp_noindent x
| l ->
Format.fprintf fmt "(";
List.iter (pp_noindent fmt) l;
Format.fprintf fmt ")")
let pp = pp_noindent
let rec to_chan oc t =
Sexp.match_ t
~atom:(fun s -> Printf.fprintf oc "%d:%s" (String.length s) s)
~list:(function
| [] -> output_string oc "()"
| [x] -> Printf.fprintf oc "(%a)" to_chan x
| l ->
output_char oc '(';
List.iter (to_chan oc) l;
output_char oc ')')
let to_file_iter filename iter =
_with_out filename
(fun oc -> iter (fun t -> to_chan oc t))
let to_file filename t = to_file_iter filename (fun k -> k t)
(** {3 Parsing} *)
module type INPUT = sig
exception EOF
val read_char : unit -> char
val read_string : int -> string
end
module Decoder(I:INPUT) = struct
let[@inline] is_num_ c = Char.code c >= Char.code '0' && Char.code c <= Char.code '9'
let[@inline] as_num_ c = Char.code c - Char.code '0'
let next_ () : sexp or_error * bool =
let rec read_string_len n =
match I.read_char () with
| c when is_num_ c -> read_string_len (n * 10 + as_num_ c)
| ':' ->
let s = I.read_string n in
atom s
| _ -> failwith "expected string length"
and eat_colon () =
match I.read_char () with
| ':' -> ()
| _ -> failwith "expected ':'"
and read_in_paren acc =
match I.read_char () with
| ')' -> list (List.rev acc)
| c when is_num_ c ->
let sexp = read_string_len (as_num_ c) in
read_in_paren (sexp::acc)
| '(' ->
let sexp = read_in_paren [] in
read_in_paren (sexp::acc)
| _ -> failwith "expected list of sexprs"
in
(* read a S-expr *)
try
begin match I.read_char () with
| exception I.EOF -> Error "unexpected EOF", true
| '(' -> Ok (read_in_paren []), false
| '0' -> eat_colon (); Ok (atom ""), false
| c when is_num_ c -> Ok (read_string_len (as_num_ c)), false
| _ -> Error "unexpected char, expected toplevel sexpr", false
end
with Failure e -> Error e, false
let to_list () : _ or_error =
let rec iter acc =
match next_ () with
| Error _, true -> Ok (List.rev acc)
| Ok x, _ -> iter (x::acc)
| Error _ as res, _ -> res
in
try iter []
with e -> Error (Printexc.to_string e)
let[@inline] next_or_error () : _ or_error = fst (next_ ())
end[@@inline]
module Decoder_str(X : sig val s : string end) =
Decoder(struct
exception EOF
let i = ref 0
let n = String.length X.s
let read_char () =
if !i >= n then raise_notrace EOF;
let c = String.unsafe_get X.s !i in
incr i;
c
let read_string len =
if !i + len > n then raise_notrace EOF;
let res = String.sub X.s !i len in
i := !i + len;
res
end)[@@inline]
let parse_string s : t or_error =
let module D = Decoder_str(struct let s=s end) in
D.next_or_error ()
let parse_string_list s : t list or_error =
let module D = Decoder_str(struct let s=s end) in
D.to_list ()
module Decoder_ic(X : sig val ic : in_channel end) =
Decoder(struct
exception EOF = End_of_file
let[@inline] read_char () = input_char X.ic
let read_string n =
match n with
| 0 -> ""
| 1 -> String.make 1 (read_char ())
| _ ->
let buf = Bytes.make n '\000' in
let i = ref 0 in
while !i < n do
let len = input X.ic buf !i (n - !i) in
i := !i + len;
done;
Bytes.unsafe_to_string buf
end)[@@inline]
let parse_chan_ ?file ic : sexp or_error =
let module D = Decoder_ic(struct let ic=ic end) in
match D.next_or_error(), file with
| Error s, Some file -> Error (Printf.sprintf "%s in '%s'" s file)
| r, _ -> r
let parse_chan_list_ ?file ic =
let module D = Decoder_ic(struct let ic=ic end) in
match D.to_list (), file with
| Error s, Some file -> Error (Printf.sprintf "%s in '%s'" s file)
| r, _ -> r
let parse_chan ic = parse_chan_ ic
let parse_chan_list ic = parse_chan_list_ ic
let parse_chan_gen ic =
let module D = Decoder_ic(struct let ic=ic end) in
fun () ->
match D.next_ () with
| _, true -> None
| Error e, _ -> Some (Error e)
| Ok x, _ -> Some (Ok x)
let parse_file filename = _with_in filename (parse_chan_ ~file:filename)
let parse_file_list filename = _with_in filename (parse_chan_list_ ~file:filename)
end
type t = [
| `Atom of string
| `List of t list
@ -321,20 +510,22 @@ and compare a b = match a, b with
| `Atom _, _ -> -1
| `List _, _ -> 1
include (Make(struct
type t_ = t
type t = t_
type loc = unit
let make_loc = None
let atom x = `Atom x
let list x = `List x
let atom_with_loc ~loc:_ s = atom s
let list_with_loc ~loc:_ l = list l
module Basic_ = struct
type nonrec t = t
type loc = unit
let make_loc = None
let atom x = `Atom x
let list x = `List x
let atom_with_loc ~loc:_ s = atom s
let list_with_loc ~loc:_ l = list l
let match_ x ~atom ~list = match x with
| `Atom x -> atom x
| `List l -> list l
end) : S with type t := t)
let match_ x ~atom ~list = match x with
| `Atom x -> atom x
| `List l -> list l
end
include (Make(Basic_) : S with type t := t)
module Canonical = MakeCanonical(Basic_)
(*$T
CCResult.to_opt (parse_string "(abc d/e/f \"hello \\\" () world\" )") <> None
@ -361,11 +552,18 @@ include (Make(struct
(*$inject
let sexp_bijective s = to_string s |> parse_string = Ok s
let csexp_bijective s = Canonical.to_string s |> Canonical.parse_string = Ok s
*)
(*$= & ~printer:CCFormat.(to_string (Dump.result Canonical.pp))
(Ok (`List [`Atom ""])) (parse_string "(\"\")")
(Ok (`List [`Atom ""])) (Canonical.parse_string {|(0:)|})
(Ok (`List [`Atom "a"; `Atom "b "])) (Canonical.parse_string {|(1:a2:b )|})
*)
(*$T
(Ok (`List [`Atom ""]) = (parse_string "(\"\")"))
sexp_bijective (`List [`Atom ""])
csexp_bijective (`List [`Atom ""])
*)
(*$inject
@ -398,6 +596,17 @@ include (Make(struct
(*$Q & ~count:100
sexp_gen sexp_bijective
sexp_gen csexp_bijective
*)
(*$R
let s1 =
`List (CCList.init 100_000
(fun i -> `List [`Atom "-"; `Atom (string_of_int i); `Atom ")(\n]"])) in
let str = Canonical.to_string s1 in
match Canonical.parse_string str with
| Ok s2 -> assert_equal s1 s2
| Error e -> assert_failure e
*)
let atom s : t = `Atom s

View file

@ -10,6 +10,11 @@ type 'a gen = unit -> 'a option
(** {2 Abstract representation of S-expressions}
@since NEXT_RELEASE *)
module type BASIC_SEXP = CCSexp_intf.BASIC_SEXP
(** {2 Abstract representation of S-expressions (extended)}
@since 2.7 *)
module type SEXP = CCSexp_intf.SEXP
@ -18,6 +23,10 @@ module type SEXP = CCSexp_intf.SEXP
@since 2.7 *)
module type S = CCSexp_intf.S
(** Subset of {!S}
@since NEXT_RELEASE *)
module type S0 = CCSexp_intf.S0
(** {2 Functorized operations}
This builds a parser and printer for S-expressions represented as
@ -26,6 +35,12 @@ module type S = CCSexp_intf.S
@since 2.7 *)
module Make(Sexp : SEXP) : S with type t = Sexp.t
(** {2 Parser and printer for Canonical S-exprs}
See {{: https://en.wikipedia.org/wiki/Canonical_S-expressions} wikipedia}.
These S-expressions are binary safe. *)
module MakeCanonical(Sexp : BASIC_SEXP) : S0 with type t = Sexp.t
(** {2 Basics} *)
(** A simple, structural representation of S-expressions. *)
@ -36,6 +51,10 @@ type t = [
include S with type t := t
(** Canonical encoding
@since NEXT_RELEASE *)
module Canonical : S0 with type t = t
val equal : t -> t -> bool
(** @since 3.0 *)

View file

@ -4,10 +4,22 @@ type 'a iter = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
(** {2 Abstract representation of S-expressions}
@since NEXT_RELEASE *)
module type BASIC_SEXP = sig
type t
val atom : string -> t
val list : t list -> t
val match_ : t -> atom:(string -> 'a) -> list:(t list -> 'a) -> 'a
end
(** {2 Abstract representation of S-expressions (extended)}
@since 2.7 *)
module type SEXP = sig
type t
include BASIC_SEXP
type loc
val make_loc : ((int * int) -> (int * int) -> string -> loc) option
@ -16,17 +28,12 @@ module type SEXP = sig
val atom_with_loc : loc:loc -> string -> t
val list_with_loc : loc:loc -> t list -> t
val atom : string -> t
val list : t list -> t
val match_ : t -> atom:(string -> 'a) -> list:(t list -> 'a) -> 'a
end
(** {2 Operations over S-expressions}
@since 2.7 *)
module type S = sig
module type S0 = sig
type t
type sexp = t
@ -86,29 +93,6 @@ module type S = sig
(** {2 Parsing} *)
(** A parser of ['a] can return [Yield x] when it parsed a value,
or [Fail e] when a parse error was encountered, or
[End] if the input was empty. *)
type 'a parse_result =
| Yield of 'a
| Fail of string
| End
module Decoder : sig
type t
(** Decoder *)
val of_lexbuf : Lexing.lexbuf -> t
val next : t -> sexp parse_result
(** Parse the next S-expression or return an error if the input isn't
long enough or isn't a proper S-expression. *)
val to_list : t -> sexp list or_error
(** Read all the values from this decoder.
@since 2.8 *)
end
val parse_string : string -> t or_error
(** Parse a string. *)
@ -131,4 +115,37 @@ module type S = sig
val parse_file_list : string -> t list or_error
(** Open the file and read a S-exp from it. *)
end
(** {2 Operations over S-expressions (extended)}
@since 2.7 *)
module type S = sig
include S0
(** {2 Parsing} *)
(** A parser of ['a] can return [Yield x] when it parsed a value,
or [Fail e] when a parse error was encountered, or
[End] if the input was empty. *)
type 'a parse_result =
| Yield of 'a
| Fail of string
| End
module Decoder : sig
type t
(** Decoder *)
val of_lexbuf : Lexing.lexbuf -> t
val next : t -> sexp parse_result
(** Parse the next S-expression or return an error if the input isn't
long enough or isn't a proper S-expression. *)
val to_list : t -> sexp list or_error
(** Read all the values from this decoder.
@since 2.8 *)
end
end

View file

@ -88,3 +88,4 @@ rule token = parse
| string { ATOM (remove_quotes lexbuf (Lexing.lexeme lexbuf)) }
| _ as c
{ error lexbuf (Printf.sprintf "lexing failed on char `%c`" c) }