ocaml-containers/src/sexp/CCSexpM.ml
2016-11-03 09:48:50 +01:00

379 lines
10 KiB
OCaml

(* This file is free software, part of containers. See file "license" for more details. *)
(** {1 Simple S-expression parsing/printing} *)
type 'a or_error = [ `Ok of 'a | `Error of string ]
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
type t = [
| `Atom of string
| `List of t list
]
type sexp = t
let _with_in filename f =
let ic = open_in filename in
try
let x = f ic in
close_in ic;
x
with e ->
close_in ic;
`Error (Printexc.to_string e)
let _with_out filename f =
let oc = open_out filename in
try
let x = f oc in
close_out oc;
x
with e ->
close_out oc;
raise e
(** {2 Serialization (encoding)} *)
(* shall we escape the string because of one of its chars? *)
let _must_escape s =
try
for i = 0 to String.length s - 1 do
let c = String.unsafe_get s i in
match c with
| ' ' | ';' | ')' | '(' | '"' | '\\' | '\n' | '\t' | '\r' -> raise Exit
| _ when Char.code c > 127 -> raise Exit (* non-ascii *)
| _ -> ()
done;
false
with Exit -> true
let rec to_buf b t = match t with
| `Atom s when _must_escape s -> Printf.bprintf b "\"%s\"" (String.escaped s)
| `Atom s -> Buffer.add_string b s
| `List [] -> Buffer.add_string b "()"
| `List [x] -> Printf.bprintf b "(%a)" to_buf x
| `List l ->
Buffer.add_char b '(';
List.iteri
(fun i t' -> (if i > 0 then Buffer.add_char b ' '; to_buf b t'))
l;
Buffer.add_char b ')'
let to_string t =
let b = Buffer.create 128 in
to_buf b t;
Buffer.contents b
let rec print fmt t = match t with
| `Atom s when _must_escape s -> Format.fprintf fmt "\"%s\"" (String.escaped s)
| `Atom s -> Format.pp_print_string fmt s
| `List [] -> Format.pp_print_string fmt "()"
| `List [x] -> Format.fprintf fmt "@[<hov2>(%a)@]" print x
| `List l ->
Format.fprintf fmt "@[<hov1>(";
List.iteri
(fun i t' -> (if i > 0 then Format.fprintf fmt "@ "; print fmt t'))
l;
Format.fprintf fmt ")@]"
let rec print_noindent fmt t = match t with
| `Atom s when _must_escape s -> Format.fprintf fmt "\"%s\"" (String.escaped s)
| `Atom s -> Format.pp_print_string fmt s
| `List [] -> Format.pp_print_string fmt "()"
| `List [x] -> Format.fprintf fmt "(%a)" print_noindent x
| `List l ->
Format.pp_print_char fmt '(';
List.iteri
(fun i t' -> (if i > 0 then Format.pp_print_char fmt ' '; print_noindent fmt t'))
l;
Format.pp_print_char fmt ')'
let to_chan oc t =
let fmt = Format.formatter_of_out_channel oc in
print fmt t;
Format.pp_print_flush fmt ()
let to_file_seq filename seq =
_with_out filename
(fun oc ->
seq (fun t -> to_chan oc t; output_char oc '\n')
)
let to_file filename t = to_file_seq filename (fun k -> k t)
(** {2 Deserialization (decoding)} *)
module type MONAD = sig
type 'a t
val return : 'a -> 'a t
val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
end
type 'a parse_result = ['a or_error | `End ]
module MakeDecode(M : MONAD) = struct
let (>>=) = M.(>>=)
type t = {
buf : Bytes.t;
refill : Bytes.t -> int -> int -> int M.t;
atom : Buffer.t;
mutable i : int; (* offset in [buf] *)
mutable len : int; (* how many bytes of [buf] are usable *)
mutable line : int;
mutable col : int;
}
let make ?(bufsize=1024) refill =
let bufsize = min (max bufsize 16) Sys.max_string_length in
{ buf=Bytes.create bufsize;
refill;
atom = Buffer.create 32;
i=0;
len=0;
line=1;
col=1;
}
let _is_digit c = Char.code '0' <= Char.code c && Char.code c <= Char.code '9'
let _digit2i c = Char.code c - Char.code '0'
(* refill buffer. If it works, call k_succ, otherwise call k_fail *)
let _refill t k_succ k_fail =
t.refill t.buf 0 (Bytes.length t.buf)
>>= fun n ->
t.i <- 0;
t.len <- n;
if n = 0 then k_fail t else k_succ t
(* get next char, assuming t.i < t.len *)
let _get t =
assert (t.i < t.len);
let c = Bytes.get t.buf t.i in
t.i <- t.i + 1;
if c = '\n' then (t.col <- 1; t.line <- t.line + 1) else t.col <- t.col + 1;
c
(* return an error *)
let _error t msg =
let b = Buffer.create 32 in
Printf.bprintf b "at %d, %d: " t.line t.col;
Printf.kbprintf
(fun b ->
let msg' = Buffer.contents b in
M.return (`Error msg')
) b msg
let _error_eof t = _error t "unexpected end of input"
(* The parsers all take a success continuation, and the decoder as
last arguments. The continuation is used to minimize the
number of calls to [>>=] and take two parameters, the next
char (if not consumed), and the returned expression itself *)
(* read expression *)
let rec expr k t =
if t.i = t.len then _refill t (expr k) _error_eof
else match _get t with
| ' ' | '\t' | '\r' | '\n' -> expr k t
| c -> expr_starting_with c k t
and expr_starting_with c k t = match c with
| ' ' | '\t' | '\r' | '\n' -> assert false
| ';' -> skip_comment (fun _ () -> expr k t) t
| '(' -> expr_list [] k t
| ')' -> _error t "unexpected ')'"
| '\\' -> _error t "unexpected '\\'"
| '"' -> quoted k t
| c ->
Buffer.add_char t.atom c;
atom k t
(* parse list *)
and expr_list acc k t =
if t.i = t.len then _refill t (expr_list acc k) _error_eof
else match _get t with
| ' ' | '\t' | '\r' | '\n' -> expr_list acc k t
| ')' -> k None (`List (List.rev acc))
| c ->
expr_starting_with c
(fun last e -> match last with
| Some '(' -> expr_list [] (fun _ l -> expr_list (l::acc) k t) t
| Some ')' -> k None (`List (List.rev (e::acc)))
| _ -> expr_list (e::acc) k t
) t
(* return the current atom (last char: c) *)
and _return_atom last k t =
let s = Buffer.contents t.atom in
Buffer.clear t.atom;
k last (`Atom s)
(* parse atom *)
and atom k t =
if t.i = t.len then _refill t (atom k) (_return_atom None k)
else match _get t with
| '\\' -> _error t "unexpected '\\' in non-quoted string"
| '"' -> _error t "unexpected '\"' in the middle of an atom"
| (' ' | '\r' | '\n' | '\t' | '(' | ')') as c ->
_return_atom (Some c) k t
| c ->
Buffer.add_char t.atom c;
atom k t
(* quoted string *)
and quoted k t =
if t.i = t.len then _refill t (quoted k) _error_eof
else match _get t with
| '\\' ->
(* read escaped char and continue *)
escaped
(fun c ->
Buffer.add_char t.atom c;
quoted k t
) t
| '"' -> _return_atom None k t
| c ->
Buffer.add_char t.atom c;
quoted k t
(* read escaped char *)
and escaped k t =
if t.i = t.len then _refill t (escaped k) _error_eof
else match _get t with
| 'n' -> k '\n'
| 't' -> k '\t'
| 'r' -> k '\r'
| 'b' -> k '\b'
| '\\' -> k '\\'
| '"' -> k '"'
| c when _is_digit c ->
read2int (_digit2i c) (fun n -> k (Char.chr n)) t
| c -> _error t "unexpected escaped char '%c'" c
and read2int i k t =
if t.i = t.len then _refill t (read2int i k) _error_eof
else match _get t with
| c when _is_digit c -> read1int (10 * i + _digit2i c) k t
| c -> _error t "unexpected char '%c' when reading byte" c
and read1int i k t =
if t.i = t.len then _refill t (read1int i k) _error_eof
else match _get t with
| c when _is_digit c -> k (10 * i + _digit2i c)
| c -> _error t "unexpected char '%c' when reading byte" c
(* skip until end of line, then call next() *)
and skip_comment k t =
if t.i = t.len
then _refill t (skip_comment k) _error_eof
else match _get t with
| '\n' -> k None ()
| _ -> skip_comment k t
(* top-level expression *)
let rec expr_or_end k t =
if t.i = t.len
then _refill t (expr_or_end k) (fun _ -> M.return `End)
else match _get t with
| ' ' | '\t' | '\r' | '\n' -> expr_or_end k t
| c -> expr_starting_with c k t
(* entry point *)
let next t : sexp parse_result M.t =
expr_or_end (fun _ x -> M.return (`Ok x)) t
end
module ID_MONAD = struct
type 'a t = 'a
let return x = x
let (>>=) x f = f x
end
module D = MakeDecode(ID_MONAD)
let parse_string s : t or_error =
let n = String.length s in
let stop = ref false in
let refill bytes i _len =
if !stop then 0
else (stop := true; Bytes.blit_string s 0 bytes i n; n)
in
let d = D.make ~bufsize:n refill in
match D.next d with
| `End -> `Error "unexpected end of file"
| (`Ok _ | `Error _) as res -> res
(*$T
CCError.to_opt (parse_string "(abc d/e/f \"hello \\\" () world\" )") <> None
CCError.to_opt (parse_string "(abc ( d e ffff ) \"hello/world\")") <> None
(parse_string "(abc\r\n ( d e \r\tffff ))") \
= `Ok (`List [`Atom "abc"; `List [`Atom "d"; `Atom "e"; `Atom "ffff"]])
*)
(*$inject
let sexp_gen =
let mkatom a = `Atom a and mklist l = `List l in
let atom = Q.Gen.(map mkatom (string_size ~gen:printable (1 -- 30))) in
let gen = Q.Gen.(
sized (fix
(fun self n st -> match n with
| 0 -> atom st
| _ ->
frequency
[ 1, atom
; 2, map mklist (list_size (0 -- 10) (self (n/10)))
] st
)
)) in
let rec small = function
| `Atom s -> String.length s
| `List l -> List.fold_left (fun n x->n+small x) 0 l
and print = function
| `Atom s -> Printf.sprintf "`Atom \"%s\"" s
| `List l -> "`List " ^ Q.Print.list print l
and shrink = function
| `Atom s -> Q.Iter.map mkatom (Q.Shrink.string s)
| `List l -> Q.Iter.map mklist (Q.Shrink.list ~shrink l)
in
Q.make ~print ~small ~shrink gen
let rec sexp_valid = function
| `Atom "" -> false
| `Atom _ -> true
| `List l -> List.for_all sexp_valid l
*)
(*$Q & ~count:100
sexp_gen (fun s -> sexp_valid s ==> (to_string s |> parse_string = `Ok s))
*)
let parse_chan ?bufsize ic =
let d = D.make ?bufsize (input ic) in
match D.next d with
| `End -> `Error "unexpected end of file"
| (`Ok _ | `Error _) as res -> res
let parse_chan_gen ?bufsize ic =
let d = D.make ?bufsize (input ic) in
fun () ->
match D.next d with
| `End -> None
| `Error _ as e -> Some e
| `Ok _ as res -> Some res
let parse_chan_list ?bufsize ic =
let d = D.make ?bufsize (input ic) in
let rec iter acc = match D.next d with
| `End -> `Ok (List.rev acc)
| `Ok x -> iter (x::acc)
| `Error _ as e -> e
in
iter []
let parse_file filename =
_with_in filename (fun ic -> parse_chan ic)
let parse_file_list filename =
_with_in filename (fun ic -> parse_chan_list ic)