add CCSexpM for a simpler, monadic parser of S-expressions (deprecating CCSexpStream)

This commit is contained in:
Simon Cruanes 2014-12-18 19:26:55 +01:00
parent b31bd70501
commit ed9f874d4e
6 changed files with 504 additions and 12 deletions

View file

@ -1,10 +1,16 @@
#use "topfind";;
#thread
#directory "_build/core";;
#directory "_build/misc";;
#directory "_build/pervasives/";;
#directory "_build/string";;
#directory "_build/threads";;
#require "bigarray";;
#directory "_build/src/core";;
#directory "_build/src/misc";;
#directory "_build/src/pervasives/";;
#directory "_build/src/string";;
#directory "_build/src/io";;
#directory "_build/src/iter";;
#directory "_build/src/data";;
#directory "_build/src/sexp";;
#directory "_build/src/bigarray/";;
#directory "_build/src/threads";;
#directory "_build/tests/";;
#load "containers.cma";;
#load "containers_iter.cma";;
@ -13,14 +19,11 @@
#load "containers_sexp.cma";;
#load "containers_string.cma";;
#load "containers_pervasives.cma";;
#load "containers_bigarray.cma";;
#load "containers_misc.cma";;
#thread;;
#load "containers_thread.cma";;
open Containers_misc;;
#install_printer CCSexp.print;;
#require "CamlGI";;
#load "containers_cgi.cma";;
let pp_html fmt h = Format.pp_print_string fmt (ToWeb.HTML.render h);;
#install_printer pp_html;;
(* vim:syntax=ocaml:
*)

10
_oasis
View file

@ -57,7 +57,7 @@ Library "containers_io"
Library "containers_sexp"
Path: src/sexp
Modules: CCSexp
Modules: CCSexp, CCSexpStream, CCSexpM
BuildDepends: bytes
FindlibParent: containers
FindlibName: sexp
@ -236,6 +236,14 @@ Executable id_sexp
Build$: flag(misc)
BuildDepends: containers.sexp
Executable id_sexp2
Path: examples/
Install: false
CompiledObject: native
MainIs: id_sexp2.ml
Build$: flag(misc)
BuildDepends: containers.sexp
SourceRepository head
Type: git
Location: https://github.com/c-cube/ocaml-containers

View file

@ -3,11 +3,11 @@
let () =
if Array.length Sys.argv <> 2 then failwith "usage: id_sexp file";
let f = Sys.argv.(1) in
let s = CCSexp.L.of_file f in
let s = CCSexpStream.L.of_file f in
match s with
| `Ok l ->
List.iter
(fun s -> Format.printf "@[%a@]@." CCSexp.print s)
(fun s -> Format.printf "@[%a@]@." CCSexpStream.print s)
l
| `Error msg ->
Format.printf "error: %s@." msg

13
examples/id_sexp2.ml Normal file
View file

@ -0,0 +1,13 @@
let () =
if Array.length Sys.argv <> 2 then failwith "usage: id_sexp file";
let f = Sys.argv.(1) in
let s = CCSexpM.parse_file_list f in
match s with
| `Ok l ->
List.iter
(fun s -> Format.printf "@[%a@]@." CCSexpM.print s)
l
| `Error msg ->
Format.printf "error: %s@." msg

362
src/sexp/CCSexpM.ml Normal file
View file

@ -0,0 +1,362 @@
(*
Copyright (c) 2013, Simon Cruanes
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. Redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Simple S-expression parsing/printing} *)
type 'a or_error = [ `Ok of 'a | `Error of string ]
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
type t = [
| `Atom of string
| `List of t list
]
type sexp = t
let _with_in filename f =
let ic = open_in filename in
try
let x = f ic in
close_in ic;
x
with e ->
close_in ic;
`Error (Printexc.to_string e)
let _with_out filename f =
let oc = open_out filename in
try
let x = f oc in
close_out oc;
x
with e ->
close_out oc;
raise e
(** {2 Serialization (encoding)} *)
(* shall we escape the string because of one of its chars? *)
let _must_escape s =
try
for i = 0 to String.length s - 1 do
let c = String.unsafe_get s i in
match c with
| ' ' | ';' | ')' | '(' | '"' | '\n' | '\t' -> raise Exit
| _ when Char.code c > 127 -> raise Exit (* non-ascii *)
| _ -> ()
done;
false
with Exit -> true
let rec to_buf b t = match t with
| `Atom s when _must_escape s -> Printf.bprintf b "\"%s\"" (String.escaped s)
| `Atom s -> Buffer.add_string b s
| `List [] -> Buffer.add_string b "()"
| `List [x] -> Printf.bprintf b "(%a)" to_buf x
| `List l ->
Buffer.add_char b '(';
List.iteri
(fun i t' -> (if i > 0 then Buffer.add_char b ' '; to_buf b t'))
l;
Buffer.add_char b ')'
let to_string t =
let b = Buffer.create 128 in
to_buf b t;
Buffer.contents b
let rec print fmt t = match t with
| `Atom s when _must_escape s -> Format.fprintf fmt "\"%s\"" (String.escaped s)
| `Atom s -> Format.pp_print_string fmt s
| `List [] -> Format.pp_print_string fmt "()"
| `List [x] -> Format.fprintf fmt "@[<hov2>(%a)@]" print x
| `List l ->
Format.open_hovbox 2;
Format.pp_print_char fmt '(';
List.iteri
(fun i t' -> (if i > 0 then Format.fprintf fmt "@ "; print fmt t'))
l;
Format.pp_print_char fmt ')';
Format.close_box ()
let rec print_noindent fmt t = match t with
| `Atom s when _must_escape s -> Format.fprintf fmt "\"%s\"" (String.escaped s)
| `Atom s -> Format.pp_print_string fmt s
| `List [] -> Format.pp_print_string fmt "()"
| `List [x] -> Format.fprintf fmt "(%a)" print_noindent x
| `List l ->
Format.pp_print_char fmt '(';
List.iteri
(fun i t' -> (if i > 0 then Format.pp_print_char fmt ' '; print_noindent fmt t'))
l;
Format.pp_print_char fmt ')'
let to_chan oc t =
let fmt = Format.formatter_of_out_channel oc in
print fmt t;
Format.pp_print_flush fmt ()
let to_file_seq filename seq =
_with_out filename
(fun oc ->
seq (fun t -> to_chan oc t; output_char oc '\n')
)
let to_file filename t = to_file_seq filename (fun k -> k t)
(** {2 Deserialization (decoding)} *)
module type MONAD = sig
type 'a t
val return : 'a -> 'a t
val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
end
type 'a parse_result = ['a or_error | `End ]
module MakeDecode(M : MONAD) = struct
let (>>=) = M.(>>=)
type t = {
buf : Bytes.t;
refill : Bytes.t -> int -> int -> int M.t;
atom : Buffer.t;
mutable i : int; (* offset in [buf] *)
mutable len : int; (* how many bytes of [buf] are usable *)
mutable line : int;
mutable col : int;
}
let make ?(bufsize=1024) refill =
let bufsize = min (max bufsize 16) Sys.max_string_length in
{ buf=Bytes.create bufsize;
refill;
atom = Buffer.create 32;
i=0;
len=0;
line=1;
col=1;
}
let _is_digit c = Char.code '0' <= Char.code c && Char.code c <= Char.code '9'
let _digit2i c = Char.code c - Char.code '0'
(* refill buffer. If it works, call k_succ, otherwise call k_fail *)
let _refill t k_succ k_fail =
t.refill t.buf 0 (Bytes.length t.buf)
>>= fun n ->
t.i <- 0;
t.len <- n;
if n = 0 then k_fail t else k_succ t
(* get next char, assuming t.i < t.len *)
let _get t =
assert (t.i < t.len);
let c = Bytes.get t.buf t.i in
t.i <- t.i + 1;
if c = '\n' then (t.col <- 1; t.line <- t.line + 1) else t.col <- t.col + 1;
c
(* return an error *)
let _error t msg =
let b = Buffer.create 32 in
Printf.bprintf b "at %d, %d: " t.line t.col;
Printf.kbprintf
(fun b ->
let msg' = Buffer.contents b in
M.return (`Error msg')
) b msg
let _error_eof t = _error t "unexpected end of input"
(* The parsers all take a success continuation, and the decoder as
last arguments. The continuation is used to minimize the
number of calls to [>>=] and take two parameters, the next
char (if not consumed), and the returned expression itself *)
(* read expression *)
let rec expr k t =
if t.i = t.len then _refill t (expr k) _error_eof
else match _get t with
| ' ' | '\t' | '\n' -> expr k t
| c -> expr_starting_with c k t
and expr_starting_with c k t = match c with
| ' ' | '\t' | '\n' -> assert false
| ';' -> skip_comment (fun _ () -> expr k t) t
| '(' -> expr_list [] k t
| ')' -> _error t "unexpected ')'"
| '\\' -> _error t "unexpected '\\'"
| '"' -> quoted k t
| c ->
Buffer.add_char t.atom c;
atom k t
(* parse list *)
and expr_list acc k t =
if t.i = t.len then _refill t (expr_list acc k) _error_eof
else match _get t with
| ' ' | '\t' | '\n' -> expr_list acc k t
| ')' -> k None (`List (List.rev acc))
| c ->
expr_starting_with c
(fun last e -> match last with
| Some '(' -> expr_list [] (fun _ l -> expr_list (l::acc) k t) t
| Some ')' -> k None (`List (List.rev (e::acc)))
| _ -> expr_list (e::acc) k t
) t
(* return the current atom (last char: c) *)
and _return_atom last k t =
let s = Buffer.contents t.atom in
Buffer.clear t.atom;
k last (`Atom s)
(* parse atom *)
and atom k t =
if t.i = t.len then _refill t (atom k) (_return_atom None k)
else match _get t with
| '\\' -> _error t "unexpected '\\' in non-quoted string"
| '"' -> _error t "unexpected '\"' in the middle of an atom"
| (' ' | '\n' | '\t' | '(' | ')') as c ->
_return_atom (Some c) k t
| c ->
Buffer.add_char t.atom c;
atom k t
(* quoted string *)
and quoted k t =
if t.i = t.len then _refill t (quoted k) _error_eof
else match _get t with
| '\\' ->
(* read escaped char and continue *)
escaped
(fun c ->
Buffer.add_char t.atom c;
quoted k t
) t
| '"' -> _return_atom None k t
| c ->
Buffer.add_char t.atom c;
quoted k t
(* read escaped char *)
and escaped k t =
if t.i = t.len then _refill t (escaped k) _error_eof
else match _get t with
| 'n' -> k '\n'
| 't' -> k '\t'
| 'r' -> k '\r'
| 'b' -> k '\b'
| '\\' -> k '\\'
| '"' -> k '"'
| c when _is_digit c ->
read2int (_digit2i c) (fun n -> k (Char.chr n)) t
| c -> _error t "unexpected escaped char '%c'" c
and read2int i k t =
if t.i = t.len then _refill t (read2int i k) _error_eof
else match _get t with
| c when _is_digit c -> read1int (10 * i + _digit2i c) k t
| c -> _error t "unexpected char '%c' when reading byte" c
and read1int i k t =
if t.i = t.len then _refill t (read1int i k) _error_eof
else match _get t with
| c when _is_digit c -> k (10 * i + _digit2i c)
| c -> _error t "unexpected char '%c' when reading byte" c
(* skip until end of line, then call next() *)
and skip_comment k t =
if t.i = t.len
then _refill t (skip_comment k) _error_eof
else match _get t with
| '\n' -> k None ()
| _ -> skip_comment k t
(* top-level expression *)
let rec expr_or_end k t =
if t.i = t.len
then _refill t (expr_or_end k) (fun _ -> M.return `End)
else match _get t with
| ' ' | '\t' | '\n' -> expr_or_end k t
| c -> expr_starting_with c k t
(* entry point *)
let next t : sexp parse_result M.t =
expr_or_end (fun _ x -> M.return (`Ok x)) t
end
module D = MakeDecode(struct
type 'a t = 'a
let return x = x
let (>>=) x f = f x
end)
let parse_string s : t or_error =
let n = String.length s in
let stop = ref false in
let refill bytes i _len =
if !stop then 0
else (stop := true; Bytes.blit_string s 0 bytes i n; n)
in
let d = D.make ~bufsize:n refill in
match D.next d with
| `End -> `Error "unexpected end of file"
| (`Ok _ | `Error _) as res -> res
(*$T
CCError.to_opt (parse_string "(abc d/e/f \"hello \\\" () world\" )") <> None
CCError.to_opt (parse_string "(abc ( d e ffff ) \"hello/world\")") <> None
*)
let parse_chan ?bufsize ic =
let d = D.make ?bufsize (input ic) in
match D.next d with
| `End -> `Error "unexpected end of file"
| (`Ok _ | `Error _) as res -> res
let parse_chan_gen ?bufsize ic =
let d = D.make ?bufsize (input ic) in
fun () ->
match D.next d with
| `End -> None
| `Error _ as e -> Some e
| `Ok _ as res -> Some res
let parse_chan_list ?bufsize ic =
let d = D.make ?bufsize (input ic) in
let rec iter acc = match D.next d with
| `End -> `Ok (List.rev acc)
| `Ok x -> iter (x::acc)
| `Error _ as e -> e
in
iter []
let parse_file filename =
_with_in filename (fun ic -> parse_chan ic)
let parse_file_list filename =
_with_in filename (fun ic -> parse_chan_list ic)

106
src/sexp/CCSexpM.mli Normal file
View file

@ -0,0 +1,106 @@
(*
Copyright (c) 2013, Simon Cruanes
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer. Redistributions in binary
form must reproduce the above copyright notice, this list of conditions and the
following disclaimer in the documentation and/or other materials provided with
the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*)
(** {1 Simple and efficient S-expression parsing/printing}
@since NEXT_RELEASE *)
type 'a or_error = [ `Ok of 'a | `Error of string ]
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
(** {2 Basics} *)
type t = [
| `Atom of string
| `List of t list
]
type sexp = t
(** {2 Serialization (encoding)} *)
val to_buf : Buffer.t -> t -> unit
val to_string : t -> string
val to_file : string -> t -> unit
val to_file_seq : string -> t sequence -> unit
(** Print the given sequence of expressions to a file *)
val to_chan : out_channel -> t -> unit
val print : Format.formatter -> t -> unit
(** Pretty-printer nice on human eyes (including indentation) *)
val print_noindent : Format.formatter -> t -> unit
(** Raw, direct printing as compact as possible *)
(** {2 Deserialization (decoding)} *)
module type MONAD = sig
type 'a t
val return : 'a -> 'a t
val (>>=) : 'a t -> ('a -> 'b t) -> 'b t
end
type 'a parse_result = ['a or_error | `End ]
(** A parser of ['a] can return [`Ok x] when it parsed a value,
or [`Error e] when a parse error was encountered, or
[`End] if the input was empty *)
module MakeDecode(M : MONAD) : sig
type t
(** Decoder *)
val make : ?bufsize:int -> (Bytes.t -> int -> int -> int M.t) -> t
(** Make a decoder with the given function used to refill an
internal buffer. The function might return [0] if the
input is exhausted.
@param bufsize size of internal buffer *)
val next : t -> sexp parse_result M.t
(** Parse the next S-expression or return an error if the input isn't
long enough or isn't a proper S-expression *)
end
val parse_string : string -> t or_error
(** Parse a string *)
val parse_chan : ?bufsize:int -> in_channel -> t or_error
(** Parse a S-expression from the given channel. Can read more data than
necessary, so don't use this if you need finer-grained control (e.g.
to read something else {b after} the S-exp) *)
val parse_chan_gen : ?bufsize:int -> in_channel -> t or_error gen
(** Parse a channel into a generator of S-expressions *)
val parse_chan_list : ?bufsize:int -> in_channel -> t list or_error
val parse_file : string -> t or_error
(** Open the file and read a S-exp from it *)
val parse_file_list : string -> t list or_error
(** Open the file and read a S-exp from it *)