diff --git a/src/minidag/decode.ml b/src/minidag/decode.ml new file mode 100644 index 00000000..094cffd7 --- /dev/null +++ b/src/minidag/decode.ml @@ -0,0 +1,112 @@ +type t = { str: string } +(** Decoder *) + +let create str : t = { str } + +type offset = int + +type node_decoder = { + dec: t; + mutable off: offset; +} + +type value = + | Stop (** No other value left *) + | Null + | Bool of bool + | Int64 of int64 + | Float of float + | String of string + | Blob of string + | Ref of int + +exception Fail of string * offset + +let fail off msg = raise (Fail (msg, off)) +let failf off msg = Printf.ksprintf (fail off) msg + +let[@inline] read_byte_ (self : node_decoder) : int = + let c = String.get self.dec.str self.off in + self.off <- self.off + 1; + Char.code c + +let[@inline] read_leading_ (self : node_decoder) = + let c = read_byte_ self in + c lsr 4, c land 0x0f + +let read_uint64 self ~low = + match low with + | _ when low < 12 -> Int64.of_int low + | 12 -> read_byte_ self |> Int64.of_int + | 13 -> + let n = String.get_int16_le self.dec.str self.off in + self.off <- self.off + 2; + Int64.of_int (n land 0xFFFF) + (* strip sign extension: treat as uint16 *) + | 14 -> + let n = String.get_int32_le self.dec.str self.off in + self.off <- self.off + 4; + Int64.logand (Int64.of_int32 n) 0xFFFFFFFFL (* treat as uint32 *) + | 15 -> + let n = String.get_int64_le self.dec.str self.off in + self.off <- self.off + 8; + n + | _ -> assert false + +let string_ self ~low : string = + let len = read_uint64 self ~low |> Int64.to_int in + let s = String.sub self.dec.str self.off len in + self.off <- self.off + len; + s + +let read (self : node_decoder) : value = + let off_start = self.off in + let high, low = read_leading_ self in + match high with + | 0 -> + (* make sure we can't read further *) + self.off <- String.length self.dec.str; + Stop + | 1 -> + (match low with + | 0 -> Null + | 1 -> Bool true + | 2 -> Bool false + | n -> failf off_start "invalid special: %d" n) + | 2 -> Int64 (read_uint64 self ~low) + | 3 -> Int64 (Int64.neg (read_uint64 self ~low)) + | 4 -> Float (Int64.float_of_bits (read_uint64 self ~low)) + | 5 -> String (string_ self ~low) + | 6 -> Blob (string_ self ~low) + | 7 -> Ref (read_uint64 self ~low |> Int64.to_int) + | _ -> failf off_start "invalid high: %d" high + +let read_node (self : t) (off : offset) f = + let dec = { dec = self; off } in + match read dec with + | String s -> f dec s + | _ -> fail off "expected node to start with a string" + +let iter_nodes (self : t) (f : offset -> string -> value list -> unit) : unit = + let total_len = String.length self.str in + let rec go off = + if off < total_len then ( + let dec = { dec = self; off } in + match read dec with + | String cmd -> + (* save the offset just before each read; when we see Stop, that saved + value is the Stop byte's position — next node starts one byte later *) + let stop_off = ref dec.off in + let rec collect acc = + stop_off := dec.off; + match read dec with + | Stop -> List.rev acc + | v -> collect (v :: acc) + in + let args = collect [] in + f off cmd args; + go (!stop_off + 1) + | _ -> fail off "expected string at start of node" + ) + in + go 0 diff --git a/src/minidag/decode.mli b/src/minidag/decode.mli new file mode 100644 index 00000000..a548a75e --- /dev/null +++ b/src/minidag/decode.mli @@ -0,0 +1,35 @@ +type t +(** Decoder *) + +val create : string -> t +(* TODO: on demand read *) + +type node_decoder +(** Local state to read the data for a node *) + +type offset = int + +val read_node : t -> offset -> (node_decoder -> string -> 'a) -> 'a +(** Decode node at offset. The callback gets a decoder for the arguments as well + as the command's name. *) + +exception Fail of string * offset + +type value = + | Stop (** No other value left *) + | Null + | Bool of bool + | Int64 of int64 + | Float of float + | String of string + | Blob of string + | Ref of int + +val read : node_decoder -> value +(** [read dec] returns a value. + @raise Fail in case of malformed data. *) + +val iter_nodes : t -> (offset -> string -> value list -> unit) -> unit +(** Stream forward from offset 0, calling [f offset cmd args] for each node. + [args] does not include the trailing [Stop] marker. + @raise Fail if the stream is malformed. *) diff --git a/src/minidag/design.md b/src/minidag/design.md new file mode 100644 index 00000000..b5bc5dad --- /dev/null +++ b/src/minidag/design.md @@ -0,0 +1,108 @@ + + +# minidag + +A serialization format intended to be a trimmed-down, minimalistic version of [twine](https://twine-data.dev/) +focusing on extensibility. + +We define a basic set of primitive types (scalars), a notion of _reference_ (pointer to a value coming +earlier in the byte stream), and a notion of _command_ (which builds a composite value out of scalars and +references to other values). The set of commands is user defined and provides extensibility; +the builtin scalars and references provide a uniform representation that allow tools to process +_any_ DAG in the same fashion (e.g. reachability analysis, display, or pruning). + +## data model + +The actual data model is user-defined, but must be expressible in terms of DAG nodes. +Each node has a _command_, and a list of scalar arguments. + +In other words, the bytestream contains a DAG, whose nodes have the form +`( *)`. + +Scalars comprise: `null`, booleans, int64 numbers, float32/float64, (UTF-8) strings, binary blobs, +and the notion of _reference_ (an absolute offset in the bytestream, which point to earlier to ensure +acyclicity). + +For example, lambda terms of the Simple Type Theory are represented by: + +``` +term := Var var | Lambda var term | Apply term term +var := { name: string, type: type } +type := Type_const name | Arrow type type +``` + +This can be defined with the following set of commands, where `@x` is a reference +a command returning type `x`: + +``` +"term.var" @var // returns term +"term.app" @term @term // returns term +"term.lambda" @var @term // returns term + +"var" string @type // returns var + +"type.const" string // returns type +"type.arrow" @type @type // returns type +``` + +## Wire format + +The format is designed to be read forward or backward. + +- forward: the byte stream is just a sequence of nodes, read in the same + order they were produced. +- backward: starting from a byte offset that is of interest (eg the + proof of "false" in a refutational prover), traverse the DAG backward + by following references. This can allow tools to touch only a fraction of the + nodes in the stream. + + +Each node is made of a list of scalars, followed by `0x00` (the stop marker). +The first scalar is normally a string that represents the _command_; the other scalars +are the command's arguments. + +Scalars are represented by a _leading byte_. The leading byte contains the scalar's +type, and some length or integer value information: + +``` + [ tag: 3 bits | padding: 1 bit | embedded integer: 4 bits ] + + ^--- high nibble ----------^ ^---- low nibble -------^ +``` + +Tags are as follow: + +- 0: stop (coincides with `0x00` being stop) +- 1: special (null=0, true=1, false=2) +- 2: non-negative integer, up to 64 bits. See later for where the data is. +- 3: negative integer, up to -2^63 (min int64). +- 4: float, 32 or 64 bits +- 5: string (UTF-8). The integer data is the byte length. +- 6: blob (raw binary). The integer data is the byte length. +- 7: reference. The integer data is the absolute offset in the stream. + +The integer data associated with the scalar is represented in the low nibble. + +- values from 0 to 11 represent themselves (ie low nibble=3 represents the embedded + integer 3). Values above this indicate the number of additional bytes, immediately + following the leading byte, and that contain the actul value. +- value 12: 1 byte follows, the value is that u8 +- value 13: 2 bytes follow, the value is that u16 (little endian) +- value 14: 4 bytes follow, the value is that u32 (little endian) +- value 15: 8 bytes follow, the value is that u64 (little endian) + +Each scalar type does a different thing with this embedded integer. +- stop/null: no integer (must be 0) +- bool: 0 is false, 1 is true +- non-negative integer: it's the value. +- negative integers: encode the absolute value +- floats: value is 14 or 15, followed by the 32- or 64-bits representation of the float. +- strings and blobs: the embedded integer is the number of bytes that follow with the data. +- reference: value is the absolute offset referred to + +So the string "hello" would be: `0x55 'h' 'e' 'l' 'l' 'o'`. +Integer `8 : u64` is `0x28`, integer `39 : u64` is `0x2c 39`. +a 64kiB (=0x10000 bytes) png blob would be `0x6e 0x00 0x01 0x00 0x00 `. + + + diff --git a/src/minidag/dune b/src/minidag/dune new file mode 100644 index 00000000..ab1a3a87 --- /dev/null +++ b/src/minidag/dune @@ -0,0 +1,5 @@ +(library + (name sidekick_minidag) + (public_name sidekick.minidag) + (synopsis "Minidag binary serialization format for proof traces") + (libraries)) diff --git a/src/minidag/encode.ml b/src/minidag/encode.ml new file mode 100644 index 00000000..af56578c --- /dev/null +++ b/src/minidag/encode.ml @@ -0,0 +1,141 @@ +class type output = object + method write : bytes -> int -> int -> unit +end + +type t = { + mutable buf: bytes; + mutable len: int; + out: output; + mutable cur_offset: int; (** How many bytes written into [out] already? *) +} +(** Encoder *) + +let buf_size = 1024 +let high_watermark = buf_size - 32 + +let create ~out () : t = + let out = (out :> output) in + { out; cur_offset = 0; buf = Bytes.create buf_size; len = 0 } + +let[@inline never] flush_ self : unit = + self.out#write self.buf 0 self.len; + self.cur_offset <- self.cur_offset + self.len; + self.len <- 0 + +let[@inline] abs_offset_ (self : t) : int = self.cur_offset + self.len +let flush self : unit = if self.len > 0 then flush_ self + +type node_encoder = t +type offset = int + +let maybe_flush self = if self.len >= high_watermark then flush_ self +let[@inline] buf_len self = Bytes.length self.buf + +let[@inline never] ensure_slow_ (self : t) n = + (* Flush before growing: peak memory stays bounded at buf_size + n rather + than the full accumulated output size. An alternative would be to grow + without flushing, but that would buffer the entire output in RAM. *) + if self.len + n >= high_watermark then flush self; + let cap = ref (buf_len self) in + while !cap < self.len + n do + cap := !cap + (!cap / 2) + done; + if !cap > buf_len self then ( + let newbuf = Bytes.create !cap in + Bytes.blit self.buf 0 newbuf 0 self.len; + self.buf <- newbuf + ) + +(** Ensure at least [n] bytes of free space *) +let ensure_ self n : unit = + let cap = buf_len self in + if cap < self.len + n then ensure_slow_ self n; + assert (self.len + n <= buf_len self) + +let write_leading (self : t) ~high ~low = + if low >= 12 then ensure_ self 9; + Bytes.set self.buf self.len (Char.unsafe_chr ((high lsl 4) lor low)); + self.len <- self.len + 1 + +let stop self = write_leading self ~high:0 ~low:0 +let null self = write_leading self ~high:1 ~low:0 + +let bool self b = + write_leading self ~high:1 + ~low: + (if b then + 1 + else + 2) + +let uint64_ self ~high i = + assert (i >= 0L); + if i < 12L then + write_leading self ~high ~low:(Int64.to_int i) + else if i < Int64.shift_left 1L 8 then ( + write_leading self ~high ~low:12; + Bytes.set_int8 self.buf self.len (Int64.to_int i); + self.len <- self.len + 1 + ) else if i < Int64.shift_left 1L 16 then ( + write_leading self ~high ~low:13; + Bytes.set_int16_le self.buf self.len (Int64.to_int i); + self.len <- self.len + 2 + ) else if i < Int64.shift_left 1L 32 then ( + write_leading self ~high ~low:14; + Bytes.set_int32_le self.buf self.len (Int64.to_int32 i); + self.len <- self.len + 4 + ) else ( + write_leading self ~high ~low:15; + Bytes.set_int64_le self.buf self.len i; + self.len <- self.len + 8 + ) + +let uint_ self ~high i = + if i < 12 then + write_leading self ~high ~low:i + else + uint64_ self ~high (Int64.of_int i) + +let int64 self i = + if i = Int64.min_int then + failwith "Encode.int64: Int64.min_int is not representable" + else if i < 0L then + uint64_ self ~high:3 (Int64.neg i) + else + uint64_ self ~high:2 i + +let int self i = + if i = min_int then + uint64_ self ~high:3 Int64.(abs (of_int i)) + else if i < 0 then + uint_ self ~high:3 (abs i) + else + uint_ self ~high:2 i + +let float self f = + (* always 64-bit; bits_of_float is a signed int64 so bypass uint64_ *) + write_leading self ~high:4 ~low:15; + Bytes.set_int64_le self.buf self.len (Int64.bits_of_float f); + self.len <- self.len + 8 + +let string_ self ~high s = + let len = String.length s in + uint_ self ~high len; + ensure_ self len; + Bytes.blit_string s 0 self.buf self.len len; + self.len <- self.len + len + +let string self s = string_ self ~high:5 s +let blob self s = string_ self ~high:6 s + +let ref self i = + assert (i < abs_offset_ self); + uint_ self ~high:7 i + +let write_node (self : t) cmd (f : node_encoder -> unit) : offset = + let offset = abs_offset_ self in + string self cmd; + f self; + stop self; + maybe_flush self; + offset diff --git a/src/minidag/encode.mli b/src/minidag/encode.mli new file mode 100644 index 00000000..1a80ac37 --- /dev/null +++ b/src/minidag/encode.mli @@ -0,0 +1,30 @@ +type t +(** Encoder *) + +class type output = object + method write : bytes -> int -> int -> unit +end + +val create : out:#output -> unit -> t + +val flush : t -> unit +(** Write all data to the internal output *) + +type node_encoder +(** Local state to write the data for a node *) + +type offset = private int + +val write_node : t -> string -> (node_encoder -> unit) -> offset +(** [write_node enc command f] starts a new node with "command", calls [f] to + add the arguments, and returns the offset of this new node. The node encoder + must not escape. *) + +val null : node_encoder -> unit +val bool : node_encoder -> bool -> unit +val int64 : node_encoder -> int64 -> unit +val int : node_encoder -> int -> unit +val float : node_encoder -> float -> unit +val string : node_encoder -> string -> unit +val blob : node_encoder -> string -> unit +val ref : node_encoder -> offset -> unit