new interface for Sexp.Streaming, allowing to provide a 'source' (generator of tokens)

This commit is contained in:
Simon Cruanes 2014-09-17 20:46:33 +02:00
parent dcf134b1eb
commit 33d33ec6e4
2 changed files with 291 additions and 196 deletions

View file

@ -118,7 +118,96 @@ let to_file filename t = seq_to_file filename (fun k -> k t)
type 'a parse_result = ['a or_error | `End ] type 'a parse_result = ['a or_error | `End ]
type 'a partial_result = [ 'a parse_result | `Await ] type 'a partial_result = [ 'a parse_result | `Await ]
module Streaming = struct module Source = struct
type individual_char =
| NC_yield of char
| NC_end
| NC_await
type t = unit -> individual_char
type source = t
module Manual = struct
type t = {
mutable i : int; (* offset *)
mutable stop : bool;
buf : Buffer.t; (* accessible chunk of input *)
}
let make() = {
i = 0;
stop = false;
buf=Buffer.create 32;
}
let to_src d () =
if d.i = Buffer.length d.buf
then
if d.stop then NC_end else NC_await
else (
let c = Buffer.nth d.buf d.i in
d.i <- d.i + 1;
NC_yield c
)
let feed d s i len =
if d.stop then failwith "Sexp.Streaming.Manual.feed: reached EOI";
Buffer.add_substring d.buf s i len
let reached_end d = d.stop <- true
end
let of_string s =
let i = ref 0 in
fun () ->
if !i=String.length s
then NC_end
else (
let c = String.get s !i in
incr i;
NC_yield c
)
let of_chan ?(bufsize=1024) ic =
let buf = String.make bufsize ' ' in
let i = ref 0 in
let n = ref 0 in
let stop = ref false in
let rec next() =
if !stop then NC_end
else if !i = !n
then ( (* refill *)
i := 0;
n := input ic buf 0 bufsize;
if !n = 0 then (stop := true; NC_end) else next()
) else ( (* yield *)
let c = String.get buf !i in
incr i;
NC_yield c
)
in next
let of_gen g =
let s = ref "" in
let i = ref 0 in
let stop = ref false in
let rec next() =
if !stop then NC_end
else if !i = String.length !s
then (
match g() with
| None -> stop := true; NC_end
| Some buf -> s := buf; i := 0; next ()
) else (
let c = String.get !s !i in
incr i;
NC_yield c
)
in next
end
module Lexer = struct
(** An individual character returned by a source *)
type token = type token =
| Open | Open
| Close | Close
@ -135,60 +224,39 @@ module Streaming = struct
| St_error of string | St_error of string
| St_end | St_end
type decoder = { type t = {
src : Source.t;
atom : Buffer.t; (* atom being parsed *)
mutable st : decode_state; mutable st : decode_state;
mutable i : int;
mutable line : int; mutable line : int;
mutable col : int; mutable col : int;
mutable stop : bool;
buf : Buffer.t;
atom : Buffer.t; (* atom being parsed *)
} }
let mk_decoder () = { let make src = {
i = 0; src;
st = St_start; st = St_start;
line = 1; line = 1;
col = 1; col = 1;
stop = false;
buf=Buffer.create 32;
atom = Buffer.create 32; atom = Buffer.create 32;
} }
exception NeedMoar let of_string s = make (Source.of_string s)
exception Error of string
exception EOI let of_chan ic = make (Source.of_chan ic)
let line t = t.line
let col t = t.col
(* yield [x] with current state [st] *) (* yield [x] with current state [st] *)
let _yield d st x = let _yield d st x =
d.st <- st; d.st <- st;
x `Ok x
(* read the next char *)
let _next_char d =
if d.i = Buffer.length d.buf
then (
(* need more input; reset buffer to put it in *)
Buffer.clear d.buf;
d.i <- 0;
raise NeedMoar
) else (
let c = Buffer.nth d.buf d.i in
d.i <- d.i + 1;
d.col <- d.col + 1;
c
)
let _take_buffer b = let _take_buffer b =
let s = Buffer.contents b in let s = Buffer.contents b in
Buffer.clear b; Buffer.clear b;
s s
let _newline d =
d.line <- d.line + 1;
d.col <- 0;
()
(* raise an error *) (* raise an error *)
let _error d msg = let _error d msg =
let b = Buffer.create 32 in let b = Buffer.create 32 in
@ -197,121 +265,116 @@ module Streaming = struct
(fun b -> (fun b ->
let msg' = Buffer.contents b in let msg' = Buffer.contents b in
d.st <- St_error msg'; d.st <- St_error msg';
raise (Error msg')) `Error msg')
b msg b msg
let _end d = let _end d =
d.st <- St_end; d.st <- St_end;
raise EOI `End
let _is_digit c = Char.code '0' <= Char.code c && Char.code c <= Char.code '9' let _is_digit c = Char.code '0' <= Char.code c && Char.code c <= Char.code '9'
let _digit2i c = Char.code c - Char.code '0' let _digit2i c = Char.code c - Char.code '0'
(* next token *) (* next token *)
let rec _next d st = let rec _next d st : token partial_result =
d.st <- st;
match st with match st with
| St_error msg -> raise (Error msg) | St_error msg -> `Error msg
| St_end -> _end d | St_end -> _end d
| St_yield x -> | St_yield x ->
(* yield the given token, then start a fresh one *) (* yield the given token, then start a fresh one *)
_yield d St_start x _yield d St_start x
| St_start when d.stop -> _end d | _ ->
| St_start -> d.st <- st;
(* start reading next token *) _process_next d st
let c = _next_char d in
begin match c with
| '\n' -> _newline d; _next d St_start
| ' ' | '\t' -> _next d St_start
| '(' -> _yield d St_start Open
| ')' -> _yield d St_start Close
| '"' -> _next d St_quoted
| _ -> (* read regular atom *)
Buffer.add_char d.atom c;
_next d St_atom
end
| St_atom when d.stop ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_atom ->
(* reading an unquoted atom *)
let c = _next_char d in
begin match c with
| '\n' ->
_newline d;
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ' ' | '\t' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ')' ->
let a = _take_buffer d.atom in
_yield d (St_yield Close) (Atom a)
| '(' ->
let a = _take_buffer d.atom in
_yield d (St_yield Open) (Atom a)
| '"' -> _error d "unexpected \" (parsing atom %s)" (Buffer.contents d.atom)
| '\\' -> _error d "unexpected \\"
| _ ->
Buffer.add_char d.atom c;
_next d St_atom
end
| St_quoted when d.stop ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_quoted ->
(* reading an unquoted atom *)
let c = _next_char d in
begin match c with
| '\\' -> _next d St_escaped
| '"' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| _ ->
Buffer.add_char d.atom c;
_next d St_quoted
end
| (St_escaped | St_raw_char1 _ | St_raw_char2 _) when d.stop ->
_error d "unexpected end of input (escaping)"
| St_escaped ->
begin match _next_char d with
| 'n' -> Buffer.add_char d.atom '\n'; _next d St_quoted
| 't' -> Buffer.add_char d.atom '\t'; _next d St_quoted
| 'r' -> Buffer.add_char d.atom '\r'; _next d St_quoted
| 'b' -> Buffer.add_char d.atom '\b'; _next d St_quoted
| '"' -> Buffer.add_char d.atom '"'; _next d St_quoted
| '\\' -> Buffer.add_char d.atom '\\'; _next d St_quoted
| c when _is_digit c -> _next d (St_raw_char1 (_digit2i c))
| c -> _error d "unexpected escaped character %c" c
end
| St_raw_char1 i ->
begin match _next_char d with
| c when _is_digit c -> _next d (St_raw_char2 (i*10 + _digit2i c))
| c -> _error d "expected digit, got %c" c
end
| St_raw_char2 i ->
begin match _next_char d with
| c when _is_digit c ->
(* read an escaped char *)
Buffer.add_char d.atom (Char.chr (i*10+_digit2i c));
_next d St_quoted
| c -> _error d "expected digit, got %c" c
end
let feed d s i len = (* read and proces the next character *)
if d.stop then failwith "Sexp.Streaming.feed: end of input reached"; and _process_next d st =
Buffer.add_substring d.buf s i len match d.src () with
| Source.NC_end ->
begin match st with
| St_error _ | St_end | St_yield _ -> assert false
| St_start -> _end d
| St_atom ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| St_quoted ->
let a = _take_buffer d.atom in
_yield d St_end (Atom a)
| (St_escaped | St_raw_char1 _ | St_raw_char2 _) ->
_error d "unexpected end of input (escaping)"
end
| Source.NC_await -> `Await
| Source.NC_yield c ->
if c='\n'
then (d.col <- 1; d.line <- d.line + 1)
else (d.col <- d.col + 1);
(* use the next char *)
match st with
| St_error _ | St_end | St_yield _ -> assert false
| St_start ->
begin match c with
| ' ' | '\t' | '\n' -> _next d St_start
| '(' -> _yield d St_start Open
| ')' -> _yield d St_start Close
| '"' -> _next d St_quoted
| _ -> (* read regular atom *)
Buffer.add_char d.atom c;
_next d St_atom
end
| St_atom ->
begin match c with
| ' ' | '\t' | '\n' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| ')' ->
let a = _take_buffer d.atom in
_yield d (St_yield Close) (Atom a)
| '(' ->
let a = _take_buffer d.atom in
_yield d (St_yield Open) (Atom a)
| '"' -> _error d "unexpected \" (parsing atom %s)" (Buffer.contents d.atom)
| '\\' -> _error d "unexpected \\"
| _ ->
Buffer.add_char d.atom c;
_next d St_atom
end
| St_quoted ->
(* reading an unquoted atom *)
begin match c with
| '\\' -> _next d St_escaped
| '"' ->
let a = _take_buffer d.atom in
_yield d St_start (Atom a)
| _ ->
Buffer.add_char d.atom c;
_next d St_quoted
end
| St_escaped ->
begin match c with
| 'n' -> Buffer.add_char d.atom '\n'; _next d St_quoted
| 't' -> Buffer.add_char d.atom '\t'; _next d St_quoted
| 'r' -> Buffer.add_char d.atom '\r'; _next d St_quoted
| 'b' -> Buffer.add_char d.atom '\b'; _next d St_quoted
| '"' -> Buffer.add_char d.atom '"'; _next d St_quoted
| '\\' -> Buffer.add_char d.atom '\\'; _next d St_quoted
| _ when _is_digit c -> _next d (St_raw_char1 (_digit2i c))
| _ -> _error d "unexpected escaped character %c" c
end
| St_raw_char1 i ->
begin match c with
| _ when _is_digit c -> _next d (St_raw_char2 (i*10 + _digit2i c))
| _ -> _error d "expected digit, got %c" c
end
| St_raw_char2 i ->
begin match c with
| c when _is_digit c ->
(* read an escaped char *)
Buffer.add_char d.atom (Char.chr (i*10+_digit2i c));
_next d St_quoted
| c -> _error d "expected digit, got %c" c
end
let reached_end d = let next d = _next d d.st
d.stop <- true
let next d =
try
`Ok (_next d d.st)
with
| NeedMoar -> `Await
| Error msg -> `Error msg
| EOI -> `End
end end
module ParseGen = struct module ParseGen = struct
@ -347,28 +410,28 @@ end
(* hidden parser state *) (* hidden parser state *)
type parser_state = { type parser_state = {
ps_d : Streaming.decoder; ps_d : Lexer.t;
mutable ps_stack : t list list; mutable ps_stack : t list list;
} }
let mk_ps () = { let mk_ps src = {
ps_d = Streaming.mk_decoder (); ps_d = Lexer.make src;
ps_stack = []; ps_stack = [];
} }
let _error ps msg = let _error ps msg =
let msg' = Printf.sprintf "at %d,%d: %s" let msg' = Printf.sprintf "at %d,%d: %s" (Lexer.line ps.ps_d) (Lexer.col ps.ps_d) msg in
ps.ps_d.Streaming.line ps.ps_d.Streaming.col msg in
`Error msg' `Error msg'
(* next token, or await *) (* next token, or await *)
let rec _next ps : t partial_result = match Streaming.next ps.ps_d with let rec _next ps : t partial_result =
| `Ok (Streaming.Atom s) -> match Lexer.next ps.ps_d with
| `Ok (Lexer.Atom s) ->
_push ps (Atom s) _push ps (Atom s)
| `Ok Streaming.Open -> | `Ok Lexer.Open ->
ps.ps_stack <- [] :: ps.ps_stack; ps.ps_stack <- [] :: ps.ps_stack;
_next ps _next ps
| `Ok Streaming.Close -> | `Ok Lexer.Close ->
begin match ps.ps_stack with begin match ps.ps_stack with
| [] -> _error ps "unbalanced ')'" | [] -> _error ps "unbalanced ')'"
| l :: stack -> | l :: stack ->
@ -387,49 +450,25 @@ and _push ps e = match ps.ps_stack with
ps.ps_stack <- (e :: l) :: tl; ps.ps_stack <- (e :: l) :: tl;
_next ps _next ps
(* assume [ps] never needs [`Await] *)
let _never_block ps () = match _next ps with
| `Await -> assert false
| `Ok x -> `Ok x
| `Error e -> `Error e
| `End -> `End
(* parse from a generator of string slices *) (* parse from a generator of string slices *)
let _parse_gen g : t ParseGen.t = let parse_gen g : t ParseGen.t =
let ps = mk_ps() in let ps = mk_ps (Source.of_gen g) in
let rec next () = match _next ps with _never_block ps
| `Await ->
begin match g() with
| None -> Streaming.reached_end ps.ps_d
| Some (s,i,len) -> Streaming.feed ps.ps_d s i len
end;
next()
| `Ok x -> `Ok x
| `Error e -> `Error e
| `End -> `End
in
next
let parse_gen g =
_parse_gen
(fun () ->
match g() with
| None -> None
| Some s -> Some (s,0,String.length s)
)
(* singleton generator *)
let _gen1 x =
let first = ref true in
fun () ->
if !first then (first:=false; Some x) else None
let parse_string s = let parse_string s =
parse_gen (_gen1 s) let ps = mk_ps (Source.of_string s) in
_never_block ps
let parse_chan ?(bufsize=1024) ic = let parse_chan ?bufsize ic =
let buf = String.make bufsize ' ' in let ps = mk_ps (Source.of_chan ?bufsize ic) in
let stop = ref false in _never_block ps
let gen () =
if !stop then None
else
let n = input ic buf 0 bufsize in
if n=0 then (stop:=true; None) else Some (buf,0,n)
in
_parse_gen gen
(** {6 Blocking} *) (** {6 Blocking} *)
@ -458,22 +497,26 @@ let parse_l_string s =
let parse_l_gen g = let parse_l_gen g =
ParseGen.to_list (parse_gen g) ParseGen.to_list (parse_gen g)
exception OhNoes of string
exception StopNaow
let parse_l_seq seq = let parse_l_seq seq =
let ps = mk_ps() in let src = Source.Manual.make () in
let ps = mk_ps (Source.Manual.to_src src) in
let l = ref [] in let l = ref [] in
(* read as many expressions as possible *) (* read as many expressions as possible *)
let rec _nexts () = match _next ps with let rec _nexts () = match _next ps with
| `Ok x -> l := x :: !l; _nexts () | `Ok x -> l := x :: !l; _nexts ()
| `Error e -> raise (Streaming.Error e) | `Error e -> raise (OhNoes e)
| `End -> raise Streaming.EOI | `End -> raise StopNaow
| `Await -> () | `Await -> ()
in in
try try
seq seq
(fun s -> Streaming.feed ps.ps_d s 0 (String.length s); _nexts ()); (fun s -> Source.Manual.feed src s 0 (String.length s); _nexts ());
Streaming.reached_end ps.ps_d; Source.Manual.reached_end src;
_nexts (); _nexts ();
`Ok (List.rev !l) `Ok (List.rev !l)
with with
| Streaming.Error msg -> `Error msg | OhNoes msg -> `Error msg
| Streaming.EOI -> `Ok (List.rev !l) | StopNaow -> `Ok (List.rev !l)

View file

@ -47,10 +47,13 @@ val to_file : string -> t -> unit
val to_chan : out_channel -> t -> unit val to_chan : out_channel -> t -> unit
val print : Format.formatter -> t -> unit val print : Format.formatter -> t -> unit
(** Pretty-printer nice on human eyes (including indentation) *)
val print_noindent : Format.formatter -> t -> unit val print_noindent : Format.formatter -> t -> unit
(** Raw, direct printing as compact as possible *)
val seq_to_file : string -> t sequence -> unit val seq_to_file : string -> t sequence -> unit
(** Print the given sequence of expressions to a file *)
(** {2 Deserialization (decoding)} *) (** {2 Deserialization (decoding)} *)
@ -59,16 +62,65 @@ type 'a partial_result = [ 'a parse_result | `Await ]
(** {6 Streaming Parsing} *) (** {6 Streaming Parsing} *)
module Streaming : sig module Source : sig
type decoder type individual_char =
| NC_yield of char
| NC_end
| NC_await
(** An individual character returned by a source *)
val mk_decoder : unit -> decoder type t = unit -> individual_char
(** A source of characters can yield them one by one, or signal the end,
or signal that some external intervention is needed *)
val feed : decoder -> string -> int -> int -> unit type source = t
(** Feed a chunk of input to the decoder *)
val reached_end : decoder -> unit (** A mnual source of individual characters. When it has exhausted its
(** Tell the decoder that end of input has been reached *) data, it asked its caller to provide more, or signal that none remains
In particular, useful when the source of data is monadic IO *)
module Manual : sig
type t
val make : unit -> t
(** Make a new manual source. It needs to be fed input manually,
using {!feed} *)
val to_src : t -> source
(** The manual source contains a source! *)
val feed : t -> string -> int -> int -> unit
(** Feed a chunk of input to the manual source *)
val reached_end : t -> unit
(** Tell the decoder that end of input has been reached. From now
the source will only yield [NC_end] *)
end
val of_string : string -> t
(** Use a single string as the source *)
val of_chan : ?bufsize:int -> in_channel -> t
(** Use a channel as the source *)
val of_gen : string gen -> t
end
module Lexer : sig
type t
(** A streaming lexer, that parses atomic chunks of S-expressions (atoms
and delimiters) *)
val make : Source.t -> t
(** Create a lexer that uses the given source of characters as an input *)
val of_string : string -> t
val of_chan : in_channel -> t
val line : t -> int
val col : t -> int
(** Obtain next token *)
type token = type token =
| Open | Open
@ -76,7 +128,7 @@ module Streaming : sig
| Atom of string | Atom of string
(** An individual S-exp token *) (** An individual S-exp token *)
val next : decoder -> token partial_result val next : t -> token partial_result
(** Obtain the next token, an error, or block/end stream *) (** Obtain the next token, an error, or block/end stream *)
end end