mirror of
https://github.com/c-cube/sidekick.git
synced 2026-05-05 08:54:24 -04:00
port minidag encode/decode from granite (encoding part only)
This commit is contained in:
parent
0c965aec23
commit
aa53ae601a
6 changed files with 431 additions and 0 deletions
112
src/minidag/decode.ml
Normal file
112
src/minidag/decode.ml
Normal file
|
|
@ -0,0 +1,112 @@
|
|||
type t = { str: string }
(** Decoder: wraps the complete encoded byte stream as an immutable string. *)

(* [create str] builds a decoder over the raw bytes [str]. *)
let create str : t = { str }

(* Absolute byte position within the stream. *)
type offset = int

(* Mutable cursor used while reading the scalars of one node. *)
type node_decoder = {
  dec: t; (* the underlying stream *)
  mutable off: offset; (* current read position, advanced by every read *)
}
|
||||
|
||||
(* One decoded scalar. [Stop] marks the end of a node's argument list. *)
type value =
  | Stop (** No other value left *)
  | Null
  | Bool of bool
  | Int64 of int64
  | Float of float
  | String of string
  | Blob of string
  | Ref of int (** Absolute offset of an earlier node in the stream *)

(* Raised on malformed data; carries a message and the offending offset. *)
exception Fail of string * offset

let fail off msg = raise (Fail (msg, off))

(* Printf-style variant of [fail]. *)
let failf off msg = Printf.ksprintf (fail off) msg
|
||||
|
||||
(* Read one byte at the cursor and advance past it. *)
let[@inline] read_byte_ (self : node_decoder) : int =
  let c = String.get self.dec.str self.off in
  self.off <- self.off + 1;
  Char.code c

(* Read a leading byte and split it into
   (high nibble = tag, low nibble = embedded integer). *)
let[@inline] read_leading_ (self : node_decoder) =
  let c = read_byte_ self in
  c lsr 4, c land 0x0f
|
||||
|
||||
(* Decode the unsigned integer payload selected by the low nibble [low]:
   values 0..11 are immediate, while 12/13/14/15 announce 1/2/4/8 trailing
   little-endian bytes. Advances the cursor past any trailing bytes. *)
let read_uint64 self ~low =
  if low < 12 then
    Int64.of_int low
  else (
    match low with
    | 12 ->
      let b = read_byte_ self in
      Int64.of_int b
    | 13 ->
      let u16 = String.get_int16_le self.dec.str self.off in
      self.off <- self.off + 2;
      (* [get_int16_le] sign-extends; mask back down to an unsigned 16-bit range *)
      Int64.of_int (u16 land 0xFFFF)
    | 14 ->
      let u32 = String.get_int32_le self.dec.str self.off in
      self.off <- self.off + 4;
      (* zero-extend the signed int32 so it reads as an unsigned 32-bit value *)
      Int64.logand (Int64.of_int32 u32) 0xFFFFFFFFL
    | 15 ->
      let u64 = String.get_int64_le self.dec.str self.off in
      self.off <- self.off + 8;
      u64
    | _ -> assert false
  )
|
||||
|
||||
(* Read a byte length (encoded via [low]) followed by that many raw bytes. *)
let string_ self ~low : string =
  let len = Int64.to_int (read_uint64 self ~low) in
  let data = String.sub self.dec.str self.off len in
  self.off <- self.off + len;
  data
|
||||
|
||||
(* [read self] decodes the next scalar at the cursor and advances past it.
   @raise Fail on malformed data. *)
let read (self : node_decoder) : value =
  let off_start = self.off in
  let high, low = read_leading_ self in
  match high with
  | 0 ->
    (* stop marker: park the cursor at the end so we can't read further *)
    self.off <- String.length self.dec.str;
    Stop
  | 1 ->
    (match low with
    | 0 -> Null
    | 1 -> Bool true
    | 2 -> Bool false
    | n -> failf off_start "invalid special: %d" n)
  | 2 -> Int64 (read_uint64 self ~low)
  | 3 -> Int64 (Int64.neg (read_uint64 self ~low))
  | 4 ->
    (* Per the wire format, floats are 32-bit (low=14) or 64-bit (low=15).
       A 32-bit float must be reinterpreted at its own width with
       [Int32.float_of_bits]; zero-extending the bits to 64 and using
       [Int64.float_of_bits] would decode garbage. *)
    (match low with
    | 14 ->
      let bits = String.get_int32_le self.dec.str self.off in
      self.off <- self.off + 4;
      Float (Int32.float_of_bits bits)
    | 15 -> Float (Int64.float_of_bits (read_uint64 self ~low))
    | n -> failf off_start "invalid float width: %d" n)
  | 5 -> String (string_ self ~low)
  | 6 -> Blob (string_ self ~low)
  | 7 -> Ref (read_uint64 self ~low |> Int64.to_int)
  | _ -> failf off_start "invalid high: %d" high
|
||||
|
||||
(* Decode the node starting at [off]: its first scalar must be a string (the
   command name), which is handed to [f] along with a cursor positioned on the
   node's arguments. *)
let read_node (self : t) (off : offset) f =
  let cursor = { dec = self; off } in
  match read cursor with
  | String cmd -> f cursor cmd
  | _ -> fail off "expected node to start with a string"
|
||||
|
||||
(* [iter_nodes self f] streams forward from offset 0, calling
   [f offset cmd args] once per node; [args] excludes the trailing [Stop]. *)
let iter_nodes (self : t) (f : offset -> string -> value list -> unit) : unit =
  let total_len = String.length self.str in
  let rec go off =
    if off < total_len then (
      let dec = { dec = self; off } in
      match read dec with
      | String cmd ->
        (* save the offset just before each read; when we see Stop, that saved
           value is the Stop byte's position — next node starts one byte later.
           (This bookkeeping is needed because [read] parks the cursor at
           end-of-stream when it returns [Stop].) *)
        let stop_off = ref dec.off in
        let rec collect acc =
          stop_off := dec.off;
          match read dec with
          | Stop -> List.rev acc
          | v -> collect (v :: acc)
        in
        let args = collect [] in
        f off cmd args;
        go (!stop_off + 1)
      | _ -> fail off "expected string at start of node"
    )
  in
  go 0
|
||||
35
src/minidag/decode.mli
Normal file
35
src/minidag/decode.mli
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
type t
(** Decoder over a complete encoded byte stream held in memory. *)

val create : string -> t
(** [create s] wraps the raw bytes [s]. *)
(* TODO: on demand read *)

type node_decoder
(** Local state to read the data for a node *)

type offset = int
(** Absolute byte position in the stream. *)

val read_node : t -> offset -> (node_decoder -> string -> 'a) -> 'a
(** Decode node at offset. The callback gets a decoder for the arguments as well
    as the command's name.
    @raise Fail if the node does not start with a string command. *)

exception Fail of string * offset
(** Malformed data, with the offset at which decoding failed. *)

type value =
  | Stop (** No other value left *)
  | Null
  | Bool of bool
  | Int64 of int64
  | Float of float
  | String of string
  | Blob of string
  | Ref of int (** Absolute offset of an earlier node in the stream *)

val read : node_decoder -> value
(** [read dec] returns a value.
    @raise Fail in case of malformed data. *)

val iter_nodes : t -> (offset -> string -> value list -> unit) -> unit
(** Stream forward from offset 0, calling [f offset cmd args] for each node.
    [args] does not include the trailing [Stop] marker.
    @raise Fail if the stream is malformed. *)
||||
108
src/minidag/design.md
Normal file
108
src/minidag/design.md
Normal file
|
|
@ -0,0 +1,108 @@
|
|||
|
||||
|
||||
# minidag
|
||||
|
||||
A serialization format intended to be a trimmed-down, minimalistic version of [twine](https://twine-data.dev/)
|
||||
focusing on extensibility.
|
||||
|
||||
We define a basic set of primitive types (scalars), a notion of _reference_ (pointer to a value coming
|
||||
earlier in the byte stream), and a notion of _command_ (which builds a composite value out of scalars and
|
||||
references to other values). The set of commands is user defined and provides extensibility;
|
||||
the builtin scalars and references provide a uniform representation that allow tools to process
|
||||
_any_ DAG in the same fashion (e.g. reachability analysis, display, or pruning).
|
||||
|
||||
## data model
|
||||
|
||||
The actual data model is user-defined, but must be expressible in terms of DAG nodes.
|
||||
Each node has a _command_, and a list of scalar arguments.
|
||||
|
||||
In other words, the bytestream contains a DAG, whose nodes have the form
|
||||
`(<command> <scalar>*)`.
|
||||
|
||||
Scalars comprise: `null`, booleans, int64 numbers, float32/float64, (UTF-8) strings, binary blobs,
and the notion of _reference_ (an absolute offset in the bytestream, which must point to an earlier
position to ensure acyclicity).
|
||||
|
||||
For example, lambda terms of the Simple Type Theory are represented by:
|
||||
|
||||
```
|
||||
term := Var var | Lambda var term | Apply term term
|
||||
var := { name: string, type: type }
|
||||
type := Type_const name | Arrow type type
|
||||
```
|
||||
|
||||
This can be defined with the following set of commands, where `@x` is a reference to
a command returning type `x`:
|
||||
|
||||
```
|
||||
"term.var" @var // returns term
|
||||
"term.app" @term @term // returns term
|
||||
"term.lambda" @var @term // returns term
|
||||
|
||||
"var" string @type // returns var
|
||||
|
||||
"type.const" string // returns type
|
||||
"type.arrow" @type @type // returns type
|
||||
```
|
||||
|
||||
## Wire format
|
||||
|
||||
The format is designed to be read forward or backward.
|
||||
|
||||
- forward: the byte stream is just a sequence of nodes, read in the same
|
||||
order they were produced.
|
||||
- backward: starting from a byte offset that is of interest (eg the
|
||||
proof of "false" in a refutational prover), traverse the DAG backward
|
||||
by following references. This can allow tools to touch only a fraction of the
|
||||
nodes in the stream.
|
||||
|
||||
|
||||
Each node is made of a list of scalars, followed by `0x00` (the stop marker).
|
||||
The first scalar is normally a string that represents the _command_; the other scalars
|
||||
are the command's arguments.
|
||||
|
||||
Scalars are represented by a _leading byte_. The leading byte contains the scalar's
|
||||
type, and some length or integer value information:
|
||||
|
||||
```
|
||||
[ tag: 3 bits | padding: 1 bit | embedded integer: 4 bits ]
|
||||
|
||||
^--- high nibble ----------^ ^---- low nibble -------^
|
||||
```
|
||||
|
||||
Tags are as follows:
|
||||
|
||||
- 0: stop (coincides with `0x00` being stop)
|
||||
- 1: special (null=0, true=1, false=2)
|
||||
- 2: non-negative integer, up to 64 bits. See later for where the data is.
|
||||
- 3: negative integer, up to -2^63 (min int64).
|
||||
- 4: float, 32 or 64 bits
|
||||
- 5: string (UTF-8). The integer data is the byte length.
|
||||
- 6: blob (raw binary). The integer data is the byte length.
|
||||
- 7: reference. The integer data is the absolute offset in the stream.
|
||||
|
||||
The integer data associated with the scalar is represented in the low nibble.
|
||||
|
||||
- values from 0 to 11 represent themselves (i.e. low nibble=3 represents the embedded
integer 3). Values above this indicate the number of additional bytes, immediately
following the leading byte, that contain the actual value.
|
||||
- value 12: 1 byte follows, the value is that u8
|
||||
- value 13: 2 bytes follow, the value is that u16 (little endian)
|
||||
- value 14: 4 bytes follow, the value is that u32 (little endian)
|
||||
- value 15: 8 bytes follow, the value is that u64 (little endian)
|
||||
|
||||
Each scalar type does a different thing with this embedded integer.
|
||||
- stop/null: no integer (must be 0)
|
||||
- bool: 1 is true, 2 is false (encoded under the special tag, alongside null=0)
|
||||
- non-negative integer: it's the value.
|
||||
- negative integers: encode the absolute value
|
||||
- floats: value is 14 or 15, followed by the 32- or 64-bits representation of the float.
|
||||
- strings and blobs: the embedded integer is the number of bytes that follow with the data.
|
||||
- reference: value is the absolute offset referred to
|
||||
|
||||
So the string "hello" would be: `0x55 'h' 'e' 'l' 'l' 'o'`.
|
||||
Integer `8 : u64` is `0x28`, integer `39 : u64` is `0x2c 0x27`.
|
||||
a 64kiB (=0x10000 bytes) png blob would be `0x6e 0x00 0x00 0x01 0x00 <the png bytes>`.
|
||||
|
||||
|
||||
|
||||
5
src/minidag/dune
Normal file
5
src/minidag/dune
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
; Minidag: minimal binary DAG serialization (see design.md in this directory).
(library
 (name sidekick_minidag)
 (public_name sidekick.minidag)
 (synopsis "Minidag binary serialization format for proof traces")
 (libraries))
|
||||
141
src/minidag/encode.ml
Normal file
141
src/minidag/encode.ml
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
(* Abstract byte sink; [write buf pos len] must consume [len] bytes of [buf]
   starting at [pos]. *)
class type output = object
  method write : bytes -> int -> int -> unit
end

type t = {
  mutable buf: bytes; (* internal write buffer, grown on demand *)
  mutable len: int; (* number of valid bytes currently in [buf] *)
  out: output; (* final destination of the encoded bytes *)
  mutable cur_offset: int; (** How many bytes written into [out] already? *)
}
(** Encoder *)

let buf_size = 1024

(* Flush threshold: leaves slack below [buf_size] so small writes still fit. *)
let high_watermark = buf_size - 32

(* [create ~out ()] makes an encoder that buffers internally and writes to
   [out]. The coercion accepts any object with a compatible [write] method. *)
let create ~out () : t =
  let out = (out :> output) in
  { out; cur_offset = 0; buf = Bytes.create buf_size; len = 0 }
|
||||
|
||||
(* Unconditionally push the buffered bytes to [out] and reset the buffer. *)
let[@inline never] flush_ self : unit =
  self.out#write self.buf 0 self.len;
  self.cur_offset <- self.cur_offset + self.len;
  self.len <- 0

(* Absolute offset of the next byte to be written (flushed + buffered). *)
let[@inline] abs_offset_ (self : t) : int = self.cur_offset + self.len

(* Flush pending bytes, if any. *)
let flush self : unit = if self.len > 0 then flush_ self

(* Same representation as [t]; the distinct name lets the .mli restrict where
   argument writes are allowed. *)
type node_encoder = t

(* Absolute byte position in the output stream. *)
type offset = int

(* Flush once the buffer crosses the watermark (called between nodes). *)
let maybe_flush self = if self.len >= high_watermark then flush_ self

(* Current capacity of the internal buffer. *)
let[@inline] buf_len self = Bytes.length self.buf
|
||||
|
||||
(* Slow path of [ensure_]: flush and/or grow so [n] more bytes fit. *)
let[@inline never] ensure_slow_ (self : t) n =
  (* Flush before growing: peak memory stays bounded at buf_size + n rather
     than the full accumulated output size. An alternative would be to grow
     without flushing, but that would buffer the entire output in RAM. *)
  if self.len + n >= high_watermark then flush self;
  (* grow capacity geometrically (x1.5) until the request fits *)
  let cap = ref (buf_len self) in
  while !cap < self.len + n do
    cap := !cap + (!cap / 2)
  done;
  if !cap > buf_len self then (
    let newbuf = Bytes.create !cap in
    Bytes.blit self.buf 0 newbuf 0 self.len;
    self.buf <- newbuf
  )

(** Ensure at least [n] bytes of free space *)
let ensure_ self n : unit =
  let cap = buf_len self in
  if cap < self.len + n then ensure_slow_ self n;
  assert (self.len + n <= buf_len self)
|
||||
|
||||
(* Write the leading byte [(high lsl 4) lor low]. A full scalar occupies at
   most 9 bytes (leading byte + up to 8 payload bytes), so reserve that much
   up front; the payload writers that follow rely on this reservation.
   Bug fix: the reservation previously only happened when [low >= 12], so a
   node made of many small scalars (each writing one byte with no [ensure_])
   could overrun the buffer — [maybe_flush] only runs between nodes. Reserve
   unconditionally; [ensure_]'s fast path makes this a cheap bounds check. *)
let write_leading (self : t) ~high ~low =
  ensure_ self 9;
  Bytes.set self.buf self.len (Char.unsafe_chr ((high lsl 4) lor low));
  self.len <- self.len + 1
|
||||
|
||||
(* End-of-node marker: tag 0, i.e. the single byte 0x00. *)
let stop self = write_leading self ~high:0 ~low:0

(* The [null] special (tag 1, value 0). *)
let null self = write_leading self ~high:1 ~low:0

(* Booleans share the special tag: true is 1, false is 2. *)
let bool self b =
  let low = if b then 1 else 2 in
  write_leading self ~high:1 ~low
|
||||
|
||||
(* Write a non-negative integer [v] under tag [high]: embedded in the low
   nibble when < 12, otherwise low = 12/13/14/15 announces a trailing
   u8/u16/u32/u64 little-endian payload. *)
let uint64_ self ~high v =
  assert (v >= 0L);
  if v < 12L then
    write_leading self ~high ~low:(Int64.to_int v)
  else if v < 0x100L then (
    write_leading self ~high ~low:12;
    Bytes.set_int8 self.buf self.len (Int64.to_int v);
    self.len <- self.len + 1
  ) else if v < 0x1_0000L then (
    write_leading self ~high ~low:13;
    Bytes.set_int16_le self.buf self.len (Int64.to_int v);
    self.len <- self.len + 2
  ) else if v < 0x1_0000_0000L then (
    write_leading self ~high ~low:14;
    Bytes.set_int32_le self.buf self.len (Int64.to_int32 v);
    self.len <- self.len + 4
  ) else (
    write_leading self ~high ~low:15;
    Bytes.set_int64_le self.buf self.len v;
    self.len <- self.len + 8
  )
|
||||
|
||||
(* Native-int entry point: small values take the immediate form directly,
   everything else goes through [uint64_]. *)
let uint_ self ~high i =
  if i >= 12 then
    uint64_ self ~high (Int64.of_int i)
  else
    write_leading self ~high ~low:i
|
||||
|
||||
(* Signed int64: tag 2 for non-negative values, tag 3 with the magnitude for
   negative ones. [Int64.min_int] is rejected since its magnitude overflows. *)
let int64 self i =
  if i >= 0L then
    uint64_ self ~high:2 i
  else if i = Int64.min_int then
    failwith "Encode.int64: Int64.min_int is not representable"
  else
    uint64_ self ~high:3 (Int64.neg i)
|
||||
|
||||
(* Signed native int. [min_int] is special-cased because [abs min_int]
   overflows in native arithmetic; its magnitude always fits in an int64. *)
let int self i =
  if i >= 0 then
    uint_ self ~high:2 i
  else if i = min_int then
    uint64_ self ~high:3 (Int64.neg (Int64.of_int i))
  else
    uint_ self ~high:3 (abs i)
|
||||
|
||||
(* Floats are always emitted in their 64-bit form (tag 4, low = 15).
   [Int64.bits_of_float] yields a signed bit pattern, so the raw bytes are
   written directly rather than going through [uint64_]. *)
let float self f =
  let bits = Int64.bits_of_float f in
  write_leading self ~high:4 ~low:15;
  Bytes.set_int64_le self.buf self.len bits;
  self.len <- self.len + 8
|
||||
|
||||
(* Length-prefixed byte payload under tag [high]. *)
let string_ self ~high s =
  let n = String.length s in
  uint_ self ~high n;
  ensure_ self n;
  Bytes.blit_string s 0 self.buf self.len n;
  self.len <- self.len + n

(* UTF-8 string: tag 5. *)
let string self s = string_ self ~high:5 s

(* Raw binary blob: tag 6. *)
let blob self s = string_ self ~high:6 s
|
||||
|
||||
(* A reference (tag 7) must point strictly before the current write position,
   which keeps the stream acyclic by construction. *)
let ref self i =
  assert (i < abs_offset_ self);
  uint_ self ~high:7 i
|
||||
|
||||
(* [write_node enc cmd f] emits one node: the command string, the arguments
   written by [f], then the stop marker. Returns the node's absolute offset,
   usable later as a reference. Flushes between nodes when the buffer fills. *)
let write_node (self : t) cmd (f : node_encoder -> unit) : offset =
  let node_off = abs_offset_ self in
  string self cmd;
  f self;
  stop self;
  maybe_flush self;
  node_off
|
||||
30
src/minidag/encode.mli
Normal file
30
src/minidag/encode.mli
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
type t
(** Encoder *)

class type output = object
  (* [write buf pos len] must consume [len] bytes of [buf] from [pos]. *)
  method write : bytes -> int -> int -> unit
end

val create : out:#output -> unit -> t
(** [create ~out ()] makes an encoder that buffers internally and writes the
    encoded bytes to [out]. *)

val flush : t -> unit
(** Write all data to the internal output *)

type node_encoder
(** Local state to write the data for a node *)

type offset = private int
(** Absolute byte position of a node in the output stream. *)

val write_node : t -> string -> (node_encoder -> unit) -> offset
(** [write_node enc command f] starts a new node with "command", calls [f] to
    add the arguments, and returns the offset of this new node. The node encoder
    must not escape. *)

val null : node_encoder -> unit
val bool : node_encoder -> bool -> unit

val int64 : node_encoder -> int64 -> unit
(** @raise Failure on [Int64.min_int], whose magnitude is not representable. *)

val int : node_encoder -> int -> unit
val float : node_encoder -> float -> unit
val string : node_encoder -> string -> unit
val blob : node_encoder -> string -> unit

val ref : node_encoder -> offset -> unit
(** Write a reference to an earlier node's offset. *)
||||
Loading…
Add table
Reference in a new issue