diff --git a/containers.mllib b/containers.mllib index b7256789..48724a80 100644 --- a/containers.mllib +++ b/containers.mllib @@ -19,3 +19,4 @@ Vector Bij PiCalculus Bencode +Sexp diff --git a/containers.odocl b/containers.odocl index b7256789..48724a80 100644 --- a/containers.odocl +++ b/containers.odocl @@ -19,3 +19,4 @@ Vector Bij PiCalculus Bencode +Sexp diff --git a/sexp.ml b/sexp.ml new file mode 100644 index 00000000..adff2c5c --- /dev/null +++ b/sexp.ml @@ -0,0 +1,280 @@ +(* +Copyright (c) 2013, Simon Cruanes +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. Redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Simple S-expression parsing/printing} *) + +type t = + | K of string * t (* keyword *) + | I of int + | S of string + | L of t list + +let eq a b = a = b + +let compare a b = Pervasives.compare a b + +let hash a = Hashtbl.hash a + +(** {2 Serialization (encoding)} *) + +let rec to_buf b t = match t with + | I i -> Printf.bprintf b "%d" i + | S s -> Buffer.add_string b (String.escaped s) + | K (s, t') -> + assert (s.[0] = ':'); + Buffer.add_string b s; + Buffer.add_char b ' '; + to_buf b t' + | L l -> + Buffer.add_char b '('; + List.iteri (fun i t' -> (if i > 0 then Buffer.add_char b ' '; to_buf b t')) l; + Buffer.add_char b ')' + +let to_string t = + let b = Buffer.create 32 in + to_buf b t; + Buffer.contents b + +(* TODO: improve (slow and ugly) *) +let fmt fmt t = + let b = Buffer.create 32 in + to_buf b t; + Format.pp_print_string fmt (Buffer.contents b) + +(** {2 Deserialization (decoding)} *) + +(** Deserialization is based on the {! decoder} type. Parsing can be + incremental, in which case the input is provided chunk by chunk and + the decoder contains the parsing state. Once a Sexpr value + has been parsed, other values can still be read. *) + +type decoder = { + mutable buf : string; (* input buffer *) + mutable i : int; (* index in buf *) + mutable len : int; (* length of substring to read *) + mutable c : int; (* line *) + mutable l : int; (* column *) + mutable state : parse_result; + mutable stack : partial_state list; +} (** Decoding state *) + +(** Result of parsing *) +and parse_result = + | ParseOk of t + | ParseError of string + | ParsePartial + +(** Partial state of the parser *) +and partial_state = + | PS_I of bool * int (* sign and integer *) + | PS_S of Buffer.t (* parsing a string *) + | PS_S_escape of Buffer.t (* parsing a string; prev char is \ *) + | PS_L of t list + | PS_key of string (* key, waiting for value *) + | PS_return of t (* bottom of stack *) + | PS_error of string (* error *) + +let mk_decoder () = + let dec = { + buf = ""; + i = 0; + len = 0; + c = 0; + l = 0; + state = ParsePartial; + stack = []; + } in + dec + +let is_empty dec = dec.len = 0 +let cur dec = dec.buf.[dec.i] + +let junk dec = + (* update line/column *) + (if cur dec = '\n' + then (dec.c <- 0; dec.l <- dec.l + 1) + else dec.c <- dec.c + 1); + dec.i <- dec.i + 1; + dec.len <- dec.len - 1 + +let next dec = + let c = cur dec in + junk dec; + c + +(* parse value *) +let rec parse_rec dec = + match dec.stack with + | [PS_return v] -> (* return value *) + dec.stack <- []; + dec.state <- ParseOk v; + dec.state + | [PS_error s] -> (* failure *) + dec.stack <- []; + dec.state <- ParseError s; + dec.state + | _ -> + if is_empty dec then ParsePartial (* wait *) + else begin + let c = next dec in + (match dec.stack, c with + | PS_S_escape b :: stack, 'n' -> + Buffer.add_char b '\n'; + dec.stack <- PS_S b :: stack + | PS_S_escape b :: stack, 't' -> + Buffer.add_char b '\t'; + dec.stack <- PS_S b :: stack + | (PS_S_escape b) :: stack, ('(' | '\\' | ')' | ' ') -> + Buffer.add_char b c; + dec.stack <- (PS_S b) :: stack; + | (PS_key s) :: _, (')' | '\n' | ' ' | '\t') -> (* error *) + error dec ("keyword " ^ s ^ " expected value") + | _, ')' -> (* special case for ')' *) + close_paren dec + | ((PS_L _ | PS_key _) :: _ | []), '-' -> (* negative num *) + dec.stack <- PS_I (false, 0) :: dec.stack + | ((PS_L _ | PS_key _) :: _ | []), '0' .. '9' -> (* positive num *) + dec.stack <- PS_I (true, Char.code c - Char.code '0') :: dec.stack + | (PS_I (sign, i)) :: stack, '0' .. '9' -> + dec.stack <- PS_I (sign, (Char.code c - Char.code '0') + 10 * i) :: stack; + | (PS_I (sign, i)) :: stack, (' ' | '\t' | '\n') -> + terminate_token dec + | stack, '(' -> + dec.stack <- PS_L [] :: stack (* push new list *) + | PS_S b :: stack, (' ' | '\t' | '\n') -> (* parsed a string *) + terminate_token dec + | PS_S b :: stack, '\\' -> + dec.stack <- PS_S_escape b :: stack (* escape next char *) + | PS_S b :: _, _ -> + Buffer.add_char b c (* just a char of the string *) + | _, (' ' | '\t' | '\n') -> (* skip *) + () + | stack, c -> + let b = Buffer.create 7 in + Buffer.add_char b c; + dec.stack <- PS_S b :: stack + ); + parse_rec dec + end +(* When a value is parsed, push it on the stack (possibly collapsing it) *) +and push_value dec v = + match v, dec.stack with + | _, [] -> + dec.stack <- [PS_return v] (* finished *) + | _, (PS_L l) :: stack -> + (* add to list *) + dec.stack <- (PS_L (v :: l)) :: stack; + | v, ((PS_key s) :: stack) -> + (* parsed a key/value *) + dec.stack <- stack; + push_value dec (K (s, v)) + | _ -> + error dec "unexpected value" +(* closing parenthesis: may terminate several states at once *) +and close_paren dec = + match dec.stack with + | PS_L l :: stack -> + dec.stack <- stack; + push_value dec (L (List.rev l)) + | (PS_I _ | PS_S _) :: stack -> + terminate_token dec; + close_paren dec (* parenthesis still not closed *) + | _ -> + error dec "Sexp: unexpected ')'" +(* terminate current token *) +and terminate_token dec = + match dec.stack with + | [] -> assert false + | (PS_I (sign, i)) :: stack -> + dec.stack <- stack; + push_value dec (I (if sign then i else ~- i)) (* parsed int *) + | (PS_S b) :: stack -> + dec.stack <- stack; + let s = Buffer.contents b in + if s.[0] = ':' + then dec.stack <- (PS_key s) :: stack (* keyword, wait for value *) + else push_value dec (S s) + | _ -> + error dec "Sexp: ill-terminated token" +(* signal error *) +and error dec msg = + let msg = Printf.sprintf "Sexp: error at line %d, column %d: %s" + dec.l dec.c msg in + dec.stack <- [PS_error msg] + +(* exported parse function *) +let parse dec s i len = + (if i < 0 || i+len > String.length s + then invalid_arg "Sexp.parse: not a valid substring"); + (* add the input to [dec] *) + if dec.len = 0 + then begin + dec.buf <- s; + dec.i <- i; + dec.len <- len; + end else begin + (* use a buffer to merge the stored input and the new input *) + let b = Buffer.create (dec.len + len) in + Buffer.add_substring b dec.buf dec.i dec.len; + Buffer.add_substring b s i len; + dec.buf <- Buffer.contents b; + dec.i <- 0; + dec.len <- dec.len + len; + end; + (* state machine *) + parse_rec dec + +let reset dec = + dec.l <- 0; + dec.c <- 0; + dec.i <- 0; + dec.len <- 0; + dec.state <- ParsePartial; + dec.stack <- []; + () + +let state dec = dec.state + +let rest dec = + String.sub dec.buf dec.i dec.len + +let rest_size dec = + dec.len + +let parse_string s = + let dec = mk_decoder () in + parse dec s 0 (String.length s) + +let of_string s = + match parse_string s with + | ParseOk t -> t + | ParsePartial -> invalid_arg "Sexp: partial parse" + | ParseError msg -> invalid_arg msg + +(* tests: + +let s = Sexp.of_string "(0 a b c 42 :foo 45 :bar (hello-world foo\\tb\\na\\(\\)r -421) (41 -52) 0)";; +Sexp.to_string s;; +*) diff --git a/sexp.mli b/sexp.mli new file mode 100644 index 00000000..e2921285 --- /dev/null +++ b/sexp.mli @@ -0,0 +1,86 @@ +(* +Copyright (c) 2013, Simon Cruanes +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. Redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {1 Simple S-expression parsing/printing} *) + +type t = + | K of string * t (* keyword *) + | I of int + | S of string + | L of t list + +val eq : t -> t -> bool +val compare : t -> t -> int +val hash : t -> int + +(** {2 Serialization (encoding)} *) + +val to_buf : Buffer.t -> t -> unit +val to_string : t -> string +val fmt : Format.formatter -> t -> unit + +(** {2 Deserialization (decoding)} *) + +(** Deserialization is based on the {! decoder} type. Parsing can be + incremental, in which case the input is provided chunk by chunk and + the decoder contains the parsing state. Once a Sexpr value + has been parsed, other values can still be read. *) + +type decoder + (** Decoding state *) + +val mk_decoder : unit -> decoder + (** Create a new decoder *) + +type parse_result = + | ParseOk of t + | ParseError of string + | ParsePartial + +val parse : decoder -> string -> int -> int -> parse_result + (** [parse dec s i len] uses the partial state stored in [dec] and + the substring of [s] starting at index [i] with length [len]. + It can return an error, a value or just [ParsePartial] if + more input is needed *) + +val reset : decoder -> unit + (** Reset the decoder to its pristine state, ready to parse something + different. Before that, {! rest} and {! rest_size} can be used + to recover the part of the input that has not been consumed yet. *) + +val state : decoder -> parse_result + (** Current state of the decoder *) + +val rest : decoder -> string + (** What remains after parsing (the additional, unused input) *) + +val rest_size : decoder -> int + (** Length of [rest d]. 0 indicates that the whole input has been consumed. *) + +val parse_string : string -> parse_result + (** Parse a full value from this string. *) + +val of_string : string -> t + (** Parse the string. @raise Invalid_argument if it fails to parse. *)