port minidag encode/decode from granite (encoding part only)

This commit is contained in:
Simon Cruanes 2026-05-03 19:18:37 +00:00
parent 0c965aec23
commit aa53ae601a
6 changed files with 431 additions and 0 deletions

112
src/minidag/decode.ml Normal file
View file

@ -0,0 +1,112 @@
type t = { str: string }
(** Decoder *)
let create str : t = { str }
type offset = int
type node_decoder = {
dec: t;
mutable off: offset;
}
type value =
| Stop (** No other value left *)
| Null
| Bool of bool
| Int64 of int64
| Float of float
| String of string
| Blob of string
| Ref of int
exception Fail of string * offset
let fail off msg = raise (Fail (msg, off))
let failf off msg = Printf.ksprintf (fail off) msg
let[@inline] read_byte_ (self : node_decoder) : int =
let c = String.get self.dec.str self.off in
self.off <- self.off + 1;
Char.code c
let[@inline] read_leading_ (self : node_decoder) =
let c = read_byte_ self in
c lsr 4, c land 0x0f
let read_uint64 self ~low =
match low with
| _ when low < 12 -> Int64.of_int low
| 12 -> read_byte_ self |> Int64.of_int
| 13 ->
let n = String.get_int16_le self.dec.str self.off in
self.off <- self.off + 2;
Int64.of_int (n land 0xFFFF)
(* strip sign extension: treat as uint16 *)
| 14 ->
let n = String.get_int32_le self.dec.str self.off in
self.off <- self.off + 4;
Int64.logand (Int64.of_int32 n) 0xFFFFFFFFL (* treat as uint32 *)
| 15 ->
let n = String.get_int64_le self.dec.str self.off in
self.off <- self.off + 8;
n
| _ -> assert false
let string_ self ~low : string =
let len = read_uint64 self ~low |> Int64.to_int in
let s = String.sub self.dec.str self.off len in
self.off <- self.off + len;
s
let read (self : node_decoder) : value =
let off_start = self.off in
let high, low = read_leading_ self in
match high with
| 0 ->
(* make sure we can't read further *)
self.off <- String.length self.dec.str;
Stop
| 1 ->
(match low with
| 0 -> Null
| 1 -> Bool true
| 2 -> Bool false
| n -> failf off_start "invalid special: %d" n)
| 2 -> Int64 (read_uint64 self ~low)
| 3 -> Int64 (Int64.neg (read_uint64 self ~low))
| 4 -> Float (Int64.float_of_bits (read_uint64 self ~low))
| 5 -> String (string_ self ~low)
| 6 -> Blob (string_ self ~low)
| 7 -> Ref (read_uint64 self ~low |> Int64.to_int)
| _ -> failf off_start "invalid high: %d" high
let read_node (self : t) (off : offset) f =
let dec = { dec = self; off } in
match read dec with
| String s -> f dec s
| _ -> fail off "expected node to start with a string"
let iter_nodes (self : t) (f : offset -> string -> value list -> unit) : unit =
let total_len = String.length self.str in
let rec go off =
if off < total_len then (
let dec = { dec = self; off } in
match read dec with
| String cmd ->
(* save the offset just before each read; when we see Stop, that saved
value is the Stop byte's position next node starts one byte later *)
let stop_off = ref dec.off in
let rec collect acc =
stop_off := dec.off;
match read dec with
| Stop -> List.rev acc
| v -> collect (v :: acc)
in
let args = collect [] in
f off cmd args;
go (!stop_off + 1)
| _ -> fail off "expected string at start of node"
)
in
go 0

35
src/minidag/decode.mli Normal file
View file

@ -0,0 +1,35 @@
type t
(** Decoder *)
val create : string -> t
(* TODO: on demand read *)
type node_decoder
(** Local state to read the data for a node *)
type offset = int
val read_node : t -> offset -> (node_decoder -> string -> 'a) -> 'a
(** Decode node at offset. The callback gets a decoder for the arguments as well
as the command's name. *)
exception Fail of string * offset
type value =
| Stop (** No other value left *)
| Null
| Bool of bool
| Int64 of int64
| Float of float
| String of string
| Blob of string
| Ref of int
val read : node_decoder -> value
(** [read dec] returns a value.
@raise Fail in case of malformed data. *)
val iter_nodes : t -> (offset -> string -> value list -> unit) -> unit
(** Stream forward from offset 0, calling [f offset cmd args] for each node.
[args] does not include the trailing [Stop] marker.
@raise Fail if the stream is malformed. *)

108
src/minidag/design.md Normal file
View file

@ -0,0 +1,108 @@
# minidag
A serialization format intended to be a trimmed-down, minimalistic version of [twine](https://twine-data.dev/)
focusing on extensibility.
We define a basic set of primitive types (scalars), a notion of _reference_ (pointer to a value coming
earlier in the byte stream), and a notion of _command_ (which builds a composite value out of scalars and
references to other values). The set of commands is user defined and provides extensibility;
the builtin scalars and references provide a uniform representation that allow tools to process
_any_ DAG in the same fashion (e.g. reachability analysis, display, or pruning).
## data model
The actual data model is user-defined, but must be expressible in terms of DAG nodes.
Each node has a _command_, and a list of scalar arguments.
In other words, the bytestream contains a DAG, whose nodes have the form
`(<command> <scalar>*)`.
Scalars comprise: `null`, booleans, int64 numbers, float32/float64, (UTF-8) strings, binary blobs,
and the notion of _reference_ (an absolute offset in the bytestream, which point to earlier to ensure
acyclicity).
For example, lambda terms of the Simple Type Theory are represented by:
```
term := Var var | Lambda var term | Apply term term
var := { name: string, type: type }
type := Type_const name | Arrow type type
```
This can be defined with the following set of commands, where `@x` is a reference
a command returning type `x`:
```
"term.var" @var // returns term
"term.app" @term @term // returns term
"term.lambda" @var @term // returns term
"var" string @type // returns var
"type.const" string // returns type
"type.arrow" @type @type // returns type
```
## Wire format
The format is designed to be read forward or backward.
- forward: the byte stream is just a sequence of nodes, read in the same
order they were produced.
- backward: starting from a byte offset that is of interest (eg the
proof of "false" in a refutational prover), traverse the DAG backward
by following references. This can allow tools to touch only a fraction of the
nodes in the stream.
Each node is made of a list of scalars, followed by `0x00` (the stop marker).
The first scalar is normally a string that represents the _command_; the other scalars
are the command's arguments.
Scalars are represented by a _leading byte_. The leading byte contains the scalar's
type, and some length or integer value information:
```
[ tag: 3 bits | padding: 1 bit | embedded integer: 4 bits ]
^--- high nibble ----------^ ^---- low nibble -------^
```
Tags are as follow:
- 0: stop (coincides with `0x00` being stop)
- 1: special (null=0, true=1, false=2)
- 2: non-negative integer, up to 64 bits. See later for where the data is.
- 3: negative integer, up to -2^63 (min int64).
- 4: float, 32 or 64 bits
- 5: string (UTF-8). The integer data is the byte length.
- 6: blob (raw binary). The integer data is the byte length.
- 7: reference. The integer data is the absolute offset in the stream.
The integer data associated with the scalar is represented in the low nibble.
- values from 0 to 11 represent themselves (ie low nibble=3 represents the embedded
integer 3). Values above this indicate the number of additional bytes, immediately
following the leading byte, and that contain the actul value.
- value 12: 1 byte follows, the value is that u8
- value 13: 2 bytes follow, the value is that u16 (little endian)
- value 14: 4 bytes follow, the value is that u32 (little endian)
- value 15: 8 bytes follow, the value is that u64 (little endian)
Each scalar type does a different thing with this embedded integer.
- stop/null: no integer (must be 0)
- bool: 0 is false, 1 is true
- non-negative integer: it's the value.
- negative integers: encode the absolute value
- floats: value is 14 or 15, followed by the 32- or 64-bits representation of the float.
- strings and blobs: the embedded integer is the number of bytes that follow with the data.
- reference: value is the absolute offset referred to
So the string "hello" would be: `0x55 'h' 'e' 'l' 'l' 'o'`.
Integer `8 : u64` is `0x28`, integer `39 : u64` is `0x2c 39`.
a 64kiB (=0x10000 bytes) png blob would be `0x6e 0x00 0x01 0x00 0x00 <the png bytes>`.

5
src/minidag/dune Normal file
View file

@ -0,0 +1,5 @@
(library
(name sidekick_minidag)
(public_name sidekick.minidag)
(synopsis "Minidag binary serialization format for proof traces")
(libraries))

141
src/minidag/encode.ml Normal file
View file

@ -0,0 +1,141 @@
class type output = object
method write : bytes -> int -> int -> unit
end
type t = {
mutable buf: bytes;
mutable len: int;
out: output;
mutable cur_offset: int; (** How many bytes written into [out] already? *)
}
(** Encoder *)
let buf_size = 1024
let high_watermark = buf_size - 32
let create ~out () : t =
let out = (out :> output) in
{ out; cur_offset = 0; buf = Bytes.create buf_size; len = 0 }
let[@inline never] flush_ self : unit =
self.out#write self.buf 0 self.len;
self.cur_offset <- self.cur_offset + self.len;
self.len <- 0
let[@inline] abs_offset_ (self : t) : int = self.cur_offset + self.len
let flush self : unit = if self.len > 0 then flush_ self
type node_encoder = t
type offset = int
let maybe_flush self = if self.len >= high_watermark then flush_ self
let[@inline] buf_len self = Bytes.length self.buf
let[@inline never] ensure_slow_ (self : t) n =
(* Flush before growing: peak memory stays bounded at buf_size + n rather
than the full accumulated output size. An alternative would be to grow
without flushing, but that would buffer the entire output in RAM. *)
if self.len + n >= high_watermark then flush self;
let cap = ref (buf_len self) in
while !cap < self.len + n do
cap := !cap + (!cap / 2)
done;
if !cap > buf_len self then (
let newbuf = Bytes.create !cap in
Bytes.blit self.buf 0 newbuf 0 self.len;
self.buf <- newbuf
)
(** Ensure at least [n] bytes of free space *)
let ensure_ self n : unit =
let cap = buf_len self in
if cap < self.len + n then ensure_slow_ self n;
assert (self.len + n <= buf_len self)
let write_leading (self : t) ~high ~low =
if low >= 12 then ensure_ self 9;
Bytes.set self.buf self.len (Char.unsafe_chr ((high lsl 4) lor low));
self.len <- self.len + 1
let stop self = write_leading self ~high:0 ~low:0
let null self = write_leading self ~high:1 ~low:0
let bool self b =
write_leading self ~high:1
~low:
(if b then
1
else
2)
let uint64_ self ~high i =
assert (i >= 0L);
if i < 12L then
write_leading self ~high ~low:(Int64.to_int i)
else if i < Int64.shift_left 1L 8 then (
write_leading self ~high ~low:12;
Bytes.set_int8 self.buf self.len (Int64.to_int i);
self.len <- self.len + 1
) else if i < Int64.shift_left 1L 16 then (
write_leading self ~high ~low:13;
Bytes.set_int16_le self.buf self.len (Int64.to_int i);
self.len <- self.len + 2
) else if i < Int64.shift_left 1L 32 then (
write_leading self ~high ~low:14;
Bytes.set_int32_le self.buf self.len (Int64.to_int32 i);
self.len <- self.len + 4
) else (
write_leading self ~high ~low:15;
Bytes.set_int64_le self.buf self.len i;
self.len <- self.len + 8
)
let uint_ self ~high i =
if i < 12 then
write_leading self ~high ~low:i
else
uint64_ self ~high (Int64.of_int i)
let int64 self i =
if i = Int64.min_int then
failwith "Encode.int64: Int64.min_int is not representable"
else if i < 0L then
uint64_ self ~high:3 (Int64.neg i)
else
uint64_ self ~high:2 i
let int self i =
if i = min_int then
uint64_ self ~high:3 Int64.(abs (of_int i))
else if i < 0 then
uint_ self ~high:3 (abs i)
else
uint_ self ~high:2 i
let float self f =
(* always 64-bit; bits_of_float is a signed int64 so bypass uint64_ *)
write_leading self ~high:4 ~low:15;
Bytes.set_int64_le self.buf self.len (Int64.bits_of_float f);
self.len <- self.len + 8
let string_ self ~high s =
let len = String.length s in
uint_ self ~high len;
ensure_ self len;
Bytes.blit_string s 0 self.buf self.len len;
self.len <- self.len + len
let string self s = string_ self ~high:5 s
let blob self s = string_ self ~high:6 s
let ref self i =
assert (i < abs_offset_ self);
uint_ self ~high:7 i
let write_node (self : t) cmd (f : node_encoder -> unit) : offset =
let offset = abs_offset_ self in
string self cmd;
f self;
stop self;
maybe_flush self;
offset

30
src/minidag/encode.mli Normal file
View file

@ -0,0 +1,30 @@
type t
(** Encoder *)
class type output = object
method write : bytes -> int -> int -> unit
end
val create : out:#output -> unit -> t
val flush : t -> unit
(** Write all data to the internal output *)
type node_encoder
(** Local state to write the data for a node *)
type offset = private int
val write_node : t -> string -> (node_encoder -> unit) -> offset
(** [write_node enc command f] starts a new node with "command", calls [f] to
add the arguments, and returns the offset of this new node. The node encoder
must not escape. *)
val null : node_encoder -> unit
val bool : node_encoder -> bool -> unit
val int64 : node_encoder -> int64 -> unit
val int : node_encoder -> int -> unit
val float : node_encoder -> float -> unit
val string : node_encoder -> string -> unit
val blob : node_encoder -> string -> unit
val ref : node_encoder -> offset -> unit