new interface for Sexp.Streaming, allowing to provide a 'source' (generator of tokens)

This commit is contained in:
Simon Cruanes 2014-09-17 20:46:33 +02:00
parent dcf134b1eb
commit 33d33ec6e4
2 changed files with 291 additions and 196 deletions

View file

@ -118,7 +118,96 @@ let to_file filename t = seq_to_file filename (fun k -> k t)
type 'a parse_result = ['a or_error | `End ]
type 'a partial_result = [ 'a parse_result | `Await ]
module Streaming = struct
module Source = struct
type individual_char =
| NC_yield of char
| NC_end
| NC_await
type t = unit -> individual_char
type source = t
module Manual = struct
type t = {
mutable i : int; (* offset *)
mutable stop : bool;
buf : Buffer.t; (* accessible chunk of input *)
}
let make() = {
i = 0;
stop = false;
buf=Buffer.create 32;
}
let to_src d () =
if d.i = Buffer.length d.buf
then
if d.stop then NC_end else NC_await
else (
let c = Buffer.nth d.buf d.i in
d.i <- d.i + 1;
NC_yield c
)
let feed d s i len =
if d.stop then failwith "Sexp.Streaming.Manual.feed: reached EOI";
Buffer.add_substring d.buf s i len
let reached_end d = d.stop <- true
end
let of_string s =
let i = ref 0 in
fun () ->
if !i=String.length s
then NC_end
else (
let c = String.get s !i in
incr i;
NC_yield c
)
let of_chan ?(bufsize=1024) ic =
let buf = String.make bufsize ' ' in
let i = ref 0 in
let n = ref 0 in
let stop = ref false in
let rec next() =
if !stop then NC_end
else if !i = !n
then ( (* refill *)
i := 0;
n := input ic buf 0 bufsize;
if !n = 0 then (stop := true; NC_end) else next()
) else ( (* yield *)
let c = String.get buf !i in
incr i;
NC_yield c
)
in next
let of_gen g =
let s = ref "" in
let i = ref 0 in
let stop = ref false in
let rec next() =
if !stop then NC_end
else if !i = String.length !s
then (
match g() with
| None -> stop := true; NC_end
| Some buf -> s := buf; i := 0; next ()
) else (
let c = String.get !s !i in
incr i;
NC_yield c
)
in next
end
module Lexer = struct
(** An individual character returned by a source *)
type token =
| Open
| Close
@ -135,60 +224,39 @@ module Streaming = struct
| St_error of string
| St_end
type decoder = {
type t = {
src : Source.t;
atom : Buffer.t; (* atom being parsed *)
mutable st : decode_state;
mutable i : int;
mutable line : int;
mutable col : int;
mutable stop : bool;
buf : Buffer.t;
atom : Buffer.t; (* atom being parsed *)
}
let mk_decoder () = {
i = 0;
let make src = {
src;
st = St_start;
line = 1;
col = 1;
stop = false;
buf=Buffer.create 32;
atom = Buffer.create 32;
}
exception NeedMoar
exception Error of string
exception EOI
let of_string s = make (Source.of_string s)
let of_chan ic = make (Source.of_chan ic)
let line t = t.line
let col t = t.col
(* yield [x] with current state [st] *)
let _yield d st x =
d.st <- st;
x
(* read the next char *)
let _next_char d =
if d.i = Buffer.length d.buf
then (
(* need more input; reset buffer to put it in *)
Buffer.clear d.buf;
d.i <- 0;
raise NeedMoar
) else (
let c = Buffer.nth d.buf d.i in
d.i <- d.i + 1;
d.col <- d.col + 1;
c
)
`Ok x
let _take_buffer b =
let s = Buffer.contents b in
Buffer.clear b;
s
let _newline d =
d.line <- d.line + 1;
d.col <- 0;
()
(* raise an error *)
let _error d msg =
let b = Buffer.create 32 in
@ -197,121 +265,116 @@ module Streaming = struct
(fun b ->
let msg' = Buffer.contents b in
d.st <- St_error msg';
raise (Error msg'))
`Error msg')
b msg
let _end d =
d.st <- St_end;
raise EOI
`End
let _is_digit c = Char.code '0' <= Char.code c && Char.code c <= Char.code '9'
let _digit2i c = Char.code c - Char.code '0'
(* next token *)
let rec _next d st =
d.st <- st;
let rec _next d st : token partial_result =
match st with
| St_error msg -> raise (Error msg)
| St_error msg -> `Error msg
| St_end -> _end d
| St_yield x ->
(* yield the given token, then start a fresh one *)
_yield d St_start x
| St_start when d.stop -> _end d
| St_start ->
(* start reading next token *)
let c = _next_char d in
begin match c with
| '\n' -> _newline d; _next d St_start
| ' ' | '\t' -> _next d St_start
| '(' -> _yield d St_start Open
| ')' -> _yield d St_start Close
| '"' -> _next d St_quoted
| _ -> (* read regular atom *)
Buffer.add_char d.atom c;
_next d St_atom
end
| St_atom when d.stop ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_atom ->
(* reading an unquoted atom *)
let c = _next_char d in
begin match c with
| '\n' ->
_newline d;
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ' ' | '\t' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ')' ->
let a = _take_buffer d.atom in
_yield d (St_yield Close) (Atom a)
| '(' ->
let a = _take_buffer d.atom in
_yield d (St_yield Open) (Atom a)
| '"' -> _error d "unexpected \" (parsing atom %s)" (Buffer.contents d.atom)
| '\\' -> _error d "unexpected \\"
| _ ->
Buffer.add_char d.atom c;
_next d St_atom
end
| St_quoted when d.stop ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_quoted ->
(* reading an unquoted atom *)
let c = _next_char d in
begin match c with
| '\\' -> _next d St_escaped
| '"' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| _ ->
Buffer.add_char d.atom c;
_next d St_quoted
end
| (St_escaped | St_raw_char1 _ | St_raw_char2 _) when d.stop ->
_error d "unexpected end of input (escaping)"
| St_escaped ->
begin match _next_char d with
| 'n' -> Buffer.add_char d.atom '\n'; _next d St_quoted
| 't' -> Buffer.add_char d.atom '\t'; _next d St_quoted
| 'r' -> Buffer.add_char d.atom '\r'; _next d St_quoted
| 'b' -> Buffer.add_char d.atom '\b'; _next d St_quoted
| '"' -> Buffer.add_char d.atom '"'; _next d St_quoted
| '\\' -> Buffer.add_char d.atom '\\'; _next d St_quoted
| c when _is_digit c -> _next d (St_raw_char1 (_digit2i c))
| c -> _error d "unexpected escaped character %c" c
end
| St_raw_char1 i ->
begin match _next_char d with
| c when _is_digit c -> _next d (St_raw_char2 (i*10 + _digit2i c))
| c -> _error d "expected digit, got %c" c
end
| St_raw_char2 i ->
begin match _next_char d with
| c when _is_digit c ->
(* read an escaped char *)
Buffer.add_char d.atom (Char.chr (i*10+_digit2i c));
_next d St_quoted
| c -> _error d "expected digit, got %c" c
end
| _ ->
d.st <- st;
_process_next d st
let feed d s i len =
if d.stop then failwith "Sexp.Streaming.feed: end of input reached";
Buffer.add_substring d.buf s i len
(* read and proces the next character *)
and _process_next d st =
match d.src () with
| Source.NC_end ->
begin match st with
| St_error _ | St_end | St_yield _ -> assert false
| St_start -> _end d
| St_atom ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_quoted ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| (St_escaped | St_raw_char1 _ | St_raw_char2 _) ->
_error d "unexpected end of input (escaping)"
end
| Source.NC_await -> `Await
| Source.NC_yield c ->
if c='\n'
then (d.col <- 1; d.line <- d.line + 1)
else (d.col <- d.col + 1);
(* use the next char *)
match st with
| St_error _ | St_end | St_yield _ -> assert false
| St_start ->
begin match c with
| ' ' | '\t' | '\n' -> _next d St_start
| '(' -> _yield d St_start Open
| ')' -> _yield d St_start Close
| '"' -> _next d St_quoted
| _ -> (* read regular atom *)
Buffer.add_char d.atom c;
_next d St_atom
end
| St_atom ->
begin match c with
| ' ' | '\t' | '\n' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ')' ->
let a = _take_buffer d.atom in
_yield d (St_yield Close) (Atom a)
| '(' ->
let a = _take_buffer d.atom in
_yield d (St_yield Open) (Atom a)
| '"' -> _error d "unexpected \" (parsing atom %s)" (Buffer.contents d.atom)
| '\\' -> _error d "unexpected \\"
| _ ->
Buffer.add_char d.atom c;
_next d St_atom
end
| St_quoted ->
(* reading an unquoted atom *)
begin match c with
| '\\' -> _next d St_escaped
| '"' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| _ ->
Buffer.add_char d.atom c;
_next d St_quoted
end
| St_escaped ->
begin match c with
| 'n' -> Buffer.add_char d.atom '\n'; _next d St_quoted
| 't' -> Buffer.add_char d.atom '\t'; _next d St_quoted
| 'r' -> Buffer.add_char d.atom '\r'; _next d St_quoted
| 'b' -> Buffer.add_char d.atom '\b'; _next d St_quoted
| '"' -> Buffer.add_char d.atom '"'; _next d St_quoted
| '\\' -> Buffer.add_char d.atom '\\'; _next d St_quoted
| _ when _is_digit c -> _next d (St_raw_char1 (_digit2i c))
| _ -> _error d "unexpected escaped character %c" c
end
| St_raw_char1 i ->
begin match c with
| _ when _is_digit c -> _next d (St_raw_char2 (i*10 + _digit2i c))
| _ -> _error d "expected digit, got %c" c
end
| St_raw_char2 i ->
begin match c with
| c when _is_digit c ->
(* read an escaped char *)
Buffer.add_char d.atom (Char.chr (i*10+_digit2i c));
_next d St_quoted
| c -> _error d "expected digit, got %c" c
end
let reached_end d =
d.stop <- true
let next d =
try
`Ok (_next d d.st)
with
| NeedMoar -> `Await
| Error msg -> `Error msg
| EOI -> `End
let next d = _next d d.st
end
module ParseGen = struct
@ -347,28 +410,28 @@ end
(* hidden parser state *)
type parser_state = {
ps_d : Streaming.decoder;
ps_d : Lexer.t;
mutable ps_stack : t list list;
}
let mk_ps () = {
ps_d = Streaming.mk_decoder ();
let mk_ps src = {
ps_d = Lexer.make src;
ps_stack = [];
}
let _error ps msg =
let msg' = Printf.sprintf "at %d,%d: %s"
ps.ps_d.Streaming.line ps.ps_d.Streaming.col msg in
let msg' = Printf.sprintf "at %d,%d: %s" (Lexer.line ps.ps_d) (Lexer.col ps.ps_d) msg in
`Error msg'
(* next token, or await *)
let rec _next ps : t partial_result = match Streaming.next ps.ps_d with
| `Ok (Streaming.Atom s) ->
let rec _next ps : t partial_result =
match Lexer.next ps.ps_d with
| `Ok (Lexer.Atom s) ->
_push ps (Atom s)
| `Ok Streaming.Open ->
| `Ok Lexer.Open ->
ps.ps_stack <- [] :: ps.ps_stack;
_next ps
| `Ok Streaming.Close ->
| `Ok Lexer.Close ->
begin match ps.ps_stack with
| [] -> _error ps "unbalanced ')'"
| l :: stack ->
@ -387,49 +450,25 @@ and _push ps e = match ps.ps_stack with
ps.ps_stack <- (e :: l) :: tl;
_next ps
(* assume [ps] never needs [`Await] *)
let _never_block ps () = match _next ps with
| `Await -> assert false
| `Ok x -> `Ok x
| `Error e -> `Error e
| `End -> `End
(* parse from a generator of string slices *)
let _parse_gen g : t ParseGen.t =
let ps = mk_ps() in
let rec next () = match _next ps with
| `Await ->
begin match g() with
| None -> Streaming.reached_end ps.ps_d
| Some (s,i,len) -> Streaming.feed ps.ps_d s i len
end;
next()
| `Ok x -> `Ok x
| `Error e -> `Error e
| `End -> `End
in
next
let parse_gen g =
_parse_gen
(fun () ->
match g() with
| None -> None
| Some s -> Some (s,0,String.length s)
)
(* singleton generator *)
let _gen1 x =
let first = ref true in
fun () ->
if !first then (first:=false; Some x) else None
let parse_gen g : t ParseGen.t =
let ps = mk_ps (Source.of_gen g) in
_never_block ps
let parse_string s =
parse_gen (_gen1 s)
let ps = mk_ps (Source.of_string s) in
_never_block ps
let parse_chan ?(bufsize=1024) ic =
let buf = String.make bufsize ' ' in
let stop = ref false in
let gen () =
if !stop then None
else
let n = input ic buf 0 bufsize in
if n=0 then (stop:=true; None) else Some (buf,0,n)
in
_parse_gen gen
let parse_chan ?bufsize ic =
let ps = mk_ps (Source.of_chan ?bufsize ic) in
_never_block ps
(** {6 Blocking} *)
@ -458,22 +497,26 @@ let parse_l_string s =
let parse_l_gen g =
ParseGen.to_list (parse_gen g)
exception OhNoes of string
exception StopNaow
let parse_l_seq seq =
let ps = mk_ps() in
let src = Source.Manual.make () in
let ps = mk_ps (Source.Manual.to_src src) in
let l = ref [] in
(* read as many expressions as possible *)
let rec _nexts () = match _next ps with
| `Ok x -> l := x :: !l; _nexts ()
| `Error e -> raise (Streaming.Error e)
| `End -> raise Streaming.EOI
| `Error e -> raise (OhNoes e)
| `End -> raise StopNaow
| `Await -> ()
in
try
seq
(fun s -> Streaming.feed ps.ps_d s 0 (String.length s); _nexts ());
Streaming.reached_end ps.ps_d;
(fun s -> Source.Manual.feed src s 0 (String.length s); _nexts ());
Source.Manual.reached_end src;
_nexts ();
`Ok (List.rev !l)
with
| Streaming.Error msg -> `Error msg
| Streaming.EOI -> `Ok (List.rev !l)
| OhNoes msg -> `Error msg
| StopNaow -> `Ok (List.rev !l)

View file

@ -47,10 +47,13 @@ val to_file : string -> t -> unit
val to_chan : out_channel -> t -> unit
val print : Format.formatter -> t -> unit
(** Pretty-printer nice on human eyes (including indentation) *)
val print_noindent : Format.formatter -> t -> unit
(** Raw, direct printing as compact as possible *)
val seq_to_file : string -> t sequence -> unit
(** Print the given sequence of expressions to a file *)
(** {2 Deserialization (decoding)} *)
@ -59,16 +62,65 @@ type 'a partial_result = [ 'a parse_result | `Await ]
(** {6 Streaming Parsing} *)
module Streaming : sig
type decoder
module Source : sig
type individual_char =
| NC_yield of char
| NC_end
| NC_await
(** An individual character returned by a source *)
val mk_decoder : unit -> decoder
type t = unit -> individual_char
(** A source of characters can yield them one by one, or signal the end,
or signal that some external intervention is needed *)
val feed : decoder -> string -> int -> int -> unit
(** Feed a chunk of input to the decoder *)
type source = t
val reached_end : decoder -> unit
(** Tell the decoder that end of input has been reached *)
(** A mnual source of individual characters. When it has exhausted its
data, it asked its caller to provide more, or signal that none remains
In particular, useful when the source of data is monadic IO *)
module Manual : sig
type t
val make : unit -> t
(** Make a new manual source. It needs to be fed input manually,
using {!feed} *)
val to_src : t -> source
(** The manual source contains a source! *)
val feed : t -> string -> int -> int -> unit
(** Feed a chunk of input to the manual source *)
val reached_end : t -> unit
(** Tell the decoder that end of input has been reached. From now
the source will only yield [NC_end] *)
end
val of_string : string -> t
(** Use a single string as the source *)
val of_chan : ?bufsize:int -> in_channel -> t
(** Use a channel as the source *)
val of_gen : string gen -> t
end
module Lexer : sig
type t
(** A streaming lexer, that parses atomic chunks of S-expressions (atoms
and delimiters) *)
val make : Source.t -> t
(** Create a lexer that uses the given source of characters as an input *)
val of_string : string -> t
val of_chan : in_channel -> t
val line : t -> int
val col : t -> int
(** Obtain next token *)
type token =
| Open
@ -76,7 +128,7 @@ module Streaming : sig
| Atom of string
(** An individual S-exp token *)
val next : decoder -> token partial_result
val next : t -> token partial_result
(** Obtain the next token, an error, or block/end stream *)
end