diff --git a/bencode.ml b/bencode.ml new file mode 100644 index 00000000..a8c1fec5 --- /dev/null +++ b/bencode.ml @@ -0,0 +1,269 @@ +(* +Copyright (c) 2013, Simon Cruanes +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. Redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {6 B-encoding} *) + + +module SMap = Map.Make(String) + +type t = + | I of int + | S of string + | L of t list + | D of t SMap.t + +let rec eq t1 t2 = match t1, t2 with + | I i1, I i2 -> i1 = i2 + | S s1, S s2 -> s1 = s2 + | L l1, L l2 -> + (try List.for_all2 eq l1 l2 with Invalid_argument _ -> false) + | D d1, D d2 -> + SMap.equal eq d1 d2 + | _ -> false + +let hash t = Hashtbl.hash t + +let dict_of_list l = + let d = List.fold_left + (fun d (k, v) -> SMap.add k v d) + SMap.empty l + in + D d + +(** {2 Serialization (encoding)} *) + +let rec to_buf buf t = match t with + | I i -> Printf.bprintf buf "i%de" i + | S s -> + Printf.bprintf buf "%d:" (String.length s); + Buffer.add_string buf s + | L l -> + Buffer.add_char buf 'l'; + List.iter (fun t' -> to_buf buf t') l; + Buffer.add_char buf 'e' + | D m -> + Buffer.add_char buf 'd'; + SMap.iter (fun key t' -> to_buf buf (S key); to_buf buf t') m; + Buffer.add_char buf 'e' + +let to_string t = + let b = Buffer.create 25 in + to_buf b t; + Buffer.contents b + +let to_chan ch t = + let b = Buffer.create 25 in + to_buf b t; + Buffer.output_buffer ch b + +let fmt formatter t = + let b = Buffer.create 25 in + to_buf b t; + Format.pp_print_string formatter (Buffer.contents b) + +(** {2 Deserialization (decoding)} *) + +(** Deserialization is based on the {! decoder} type. Parsing can be + incremental, in which case the input is provided chunk by chunk and + the decoder contains the parsing state. Once a B-encoded value + has been parsed, other values can still be read. *) + +type decoder = { + mutable buf : string; (* buffer *) + mutable i : int; (* index in buf *) + mutable len : int; (* length of substring to read *) + mutable c : int; (* line *) + mutable l : int; (* column *) + mutable state : parse_result; + mutable stack : partial_state list; +} + +(** Result of parsing *) +and parse_result = + | ParseOk of t + | ParseError of string + | ParsePartial + +(** Partial state of the parser *) +and partial_state = + | PS_I of bool * int (* sign and integer *) + | PS_S of int ref * string (* index in string, plus string *) + | PS_L of t list + | PS_D of t SMap.t (* in dictionary *) + | PS_D_key of string * t SMap.t (* parsed key, wait for value *) + | PS_return of t (* bottom of stack *) + | PS_error of string (* error *) + +let mk_decoder () = + let dec = { + buf = ""; + i = 0; + len = 0; + c = 0; + l = 0; + state = ParsePartial; + stack = []; + } in + dec + +let is_empty dec = dec.len = 0 +let cur dec = dec.buf.[dec.i] + +let junk dec = + (* update line/column *) + (if cur dec = '\n' + then (dec.c <- 0; dec.l <- dec.l + 1) + else dec.c <- dec.c + 1); + dec.i <- dec.i + 1; + dec.len <- dec.len - 1 + +let next dec = + let c = cur dec in + junk dec; + c + +(* parse value *) +let rec parse_rec dec = + match dec.stack with + | [PS_return v] -> (* return value *) + dec.stack <- []; + dec.state <- ParseOk v; + dec.state + | [PS_error s] -> (* failure *) + dec.stack <- []; + dec.state <- ParseError s; + dec.state + | _ -> + if is_empty dec then ParsePartial (* wait *) + else begin + let c = next dec in + (match dec.stack, c with + | (PS_I (sign, i)) :: stack, '0' .. '9' -> + dec.stack <- PS_I (sign, (Char.code c - Char.code '0') + 10 * i) :: stack; + | (PS_I (_, 0)) :: stack, '-' -> + dec.stack <- PS_I (false, 0) :: stack (* negative number *) + | (PS_I (sign, i)) :: stack, 'e' -> + dec.stack <- stack; + push_value dec (I (if sign then i else ~- i)) + | ((PS_D _ | PS_D_key _ | PS_L _) :: _ | []), '0' .. '9' -> + (* initial length of string *) + dec.stack <- (PS_I (true, Char.code c - Char.code '0')) :: dec.stack + | (PS_I (sign, i)) :: stack, ':' -> + if i < 0 + then error dec "string length cannot be negative" + else if i = 0 then (* empty string *) + let _ = dec.stack <- stack in + push_value dec (S "") + else (* prepare to parse a string *) + dec.stack <- (PS_S (ref 0, String.create i)) :: stack; + | (PS_S (n, s)) :: stack, _ -> + s.[!n] <- c; + incr n; + (* value completed *) + (if !n = String.length s + then + let _ = dec.stack <- stack in + push_value dec (S s)); + | stack, 'i' -> + dec.stack <- (PS_I (true, 0)) :: stack + | stack, 'l' -> + dec.stack <- PS_L [] :: stack; + | stack, 'd' -> + dec.stack <- PS_D SMap.empty :: stack + | (PS_L l) :: stack, 'e' -> (* end of list *) + dec.stack <- stack; + push_value dec (L (List.rev l)) + | (PS_D d) :: stack, 'e' -> (* end of dict *) + dec.stack <- stack; + push_value dec (D d) + | (PS_D_key _) :: _, 'e' -> (* error *) + error dec "missing value in dict" + | _ -> (* generic error *) + error dec (Printf.sprintf "expected value, got %c" c)); + parse_rec dec + end +(* When a value is parsed, push it on the stack (possibly collapsing it) *) +and push_value dec v = + match v, dec.stack with + | _, [] -> + dec.stack <- [PS_return v] (* finished *) + | _, (PS_L l) :: stack -> + (* add to list *) + dec.stack <- (PS_L (v :: l)) :: stack; + | S key, ((PS_D d) :: stack) -> + (* new key for the map *) + dec.stack <- (PS_D_key (key, d)) :: stack; + | _, ((PS_D d) :: _) -> + (* error: key must be string *) + error dec "dict keys must be strings" + | _, (PS_D_key (key, d)) :: stack -> + (* new binding for the map *) + dec.stack <- (PS_D (SMap.add key v d)) :: stack; + | _ -> assert false +(* signal error *) +and error dec msg = + let msg = Printf.sprintf "Bencode: error at line %d, column %d: %s" + dec.l dec.c msg in + dec.stack <- [PS_error msg] + +(* exported parse function *) +let parse dec s i len = + (if i < 0 || i+len > String.length s + then invalid_arg "Bencode.parse: not a valid substring"); + (* add the input to [dec] *) + if dec.len = 0 + then begin + dec.buf <- s; + dec.i <- i; + dec.len <- len; + end else begin + (* use a buffer to merge the stored input and the new input *) + let b = Buffer.create (dec.len + len) in + Buffer.add_substring b dec.buf dec.i dec.len; + Buffer.add_substring b s i len; + dec.buf <- Buffer.contents b; + dec.i <- 0; + dec.len <- dec.len + len; + end; + (* state machine *) + parse_rec dec + +let state dec = dec.state + +let rest dec = + String.sub dec.buf dec.i dec.len + +let rest_size dec = + dec.len + +let parse_string s = + let dec = mk_decoder () in + parse dec s 0 (String.length s) + +let of_string s = + match parse_string s with + | ParseOk t -> t + | ParsePartial -> invalid_arg "Bencode: partial parse" + | ParseError msg -> invalid_arg msg diff --git a/bencode.mli b/bencode.mli new file mode 100644 index 00000000..3cb8c1f9 --- /dev/null +++ b/bencode.mli @@ -0,0 +1,92 @@ +(* +Copyright (c) 2013, Simon Cruanes +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. Redistributions in binary +form must reproduce the above copyright notice, this list of conditions and the +following disclaimer in the documentation and/or other materials provided with +the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*) + +(** {6 B-encoding} *) + +(** This implements encoding and decoding using the {i B-encode} format. + See {{: http://en.wikipedia.org/wiki/Bencode} wikipedia} for more details + *) + +module SMap : Map.S with type key = string + +type t = + | I of int + | S of string + | L of t list + | D of t SMap.t + +val eq : t -> t -> bool +val hash : t -> int + +val dict_of_list : (string * t) list -> t + +(** {2 Serialization (encoding)} *) + +val to_buf : Buffer.t -> t -> unit +val to_string : t -> string +val to_chan : out_channel -> t -> unit +val fmt : Format.formatter -> t -> unit + +(** {2 Deserialization (decoding)} *) + +(** Deserialization is based on the {! decoder} type. Parsing can be + incremental, in which case the input is provided chunk by chunk and + the decoder contains the parsing state. Once a B-encoded value + has been parsed, other values can still be read. + + This implementation does accept leading zeros, because it simplifies + the code. *) + +type decoder + (** Decoding state *) + +val mk_decoder : unit -> decoder + (** Create a new decoder *) + +type parse_result = + | ParseOk of t + | ParseError of string + | ParsePartial + +val parse : decoder -> string -> int -> int -> parse_result + (** [parse dec s i len] uses the partial state stored in [dec] and + the substring of [s] starting at index [i] with length [len]. + It can return an error, a value or just [ParsePartial] if + more input is needed *) + +val state : decoder -> parse_result + (** Current state of the decoder *) + +val rest : decoder -> string + (** What remains after parsing (the additional, unused input) *) + +val rest_size : decoder -> int + (** Length of [rest d]. 0 indicates that the whole input has been consumed. *) + +val parse_string : string -> parse_result + (** Parse a full value from this string. *) + +val of_string : string -> t + (** Parse the string. @raise Invalid_argument if it fails to parse. *) diff --git a/tests/test_bencode.ml b/tests/test_bencode.ml new file mode 100644 index 00000000..17ebfd8c --- /dev/null +++ b/tests/test_bencode.ml @@ -0,0 +1,46 @@ + +open OUnit + +module B = Bencode + +let test1 () = + let s = "li42ei0ei-200ee" in + match B.parse_string s with + | B.ParseError msg -> + OUnit.assert_failure (Printf.sprintf "should parse, got %s" msg) + | B.ParsePartial -> + OUnit.assert_failure "should parse, got partial" + | B.ParseOk b -> + OUnit.assert_equal (B.L [B.I 42; B.I 0; B.I ~-200]) b + +let test2 () = + let b = + B.dict_of_list [ + "foo", B.I 42; + "bar", B.L [B.I 0; B.S "caramba si"]; + "", B.S ""; + ] + in + let s = B.to_string b in + (* Printf.printf "serialized to %s\n" s; *) + let b' = B.of_string s in + OUnit.assert_equal ~cmp:B.eq ~printer:B.to_string b b' + +let test3 () = + let b = B.dict_of_list [ + "a", B.I 1; + "b", B.S "bbbb"; + "l", B.L [B.I 0; B.I 0; B.S "zero\n\t \x00"]; + "d", B.dict_of_list ["foo", B.S "bar"]; + ] in + let s = B.to_string b in + (* Printf.printf "serialized to %s\n" s; *) + let b' = B.of_string s in + OUnit.assert_equal ~cmp:B.eq ~printer:B.to_string b b' + +let suite = + "test_bencode" >::: + [ "test1" >:: test1; + "test2" >:: test2; + "test3" >:: test3; + ]