From 0ff96145207725198b5a2d294067327cbd899517 Mon Sep 17 00:00:00 2001
From: Simon Cruanes
Date: Tue, 6 May 2025 21:53:26 -0400
Subject: [PATCH] feat: add `containers.leb128` library adapted from pbrt

---
 src/leb128/containers_leb128.ml  | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/leb128/containers_leb128.mli |  55 ++++++++++++++++++++++++++++
 src/leb128/dune                  |  11 ++++++
 src/leb128/stubs.c               |  78 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 254 insertions(+)
 create mode 100644 src/leb128/containers_leb128.ml
 create mode 100644 src/leb128/containers_leb128.mli
 create mode 100644 src/leb128/dune
 create mode 100644 src/leb128/stubs.c

diff --git a/src/leb128/containers_leb128.ml b/src/leb128/containers_leb128.ml
new file mode 100644
index 00000000..f250a5eb
--- /dev/null
+++ b/src/leb128/containers_leb128.ml
@@ -0,0 +1,110 @@
+(* adapted from ocaml-protoc from code by c-cube *)
+
+module Byte_slice = CCByte_slice
+module Byte_buffer = CCByte_buffer
+
+module Decode = struct
+  (* Offsets are relative to the slice: byte [i] of [sl] is stored at
+     [sl.bs.[sl.off + i]] and is only readable when [0 <= i < sl.len]. *)
+
+  let skip (sl : Byte_slice.t) off : int =
+    let shift = ref 0 in
+    let continue = ref true in
+
+    let off = ref off in
+    let n_consumed = ref 0 in
+
+    while !continue do
+      (* bounds-check every byte, not just the first one *)
+      if !off < 0 || !off >= sl.len then invalid_arg "out of bound";
+      incr n_consumed;
+      let b = Char.code (Bytes.get sl.bs (sl.off + !off)) in
+      let cur = b land 0x7f in
+      if cur <> b then (
+        (* at least one byte follows this one; an 11th byte cannot exist *)
+        if !shift >= 63 then invalid_arg "leb128 varint is too long";
+        incr off;
+        shift := !shift + 7
+      ) else if !shift < 63 || b land 0x7f <= 1 then
+        (* the 10th byte (shift = 63) may only carry bit 63 *)
+        continue := false
+      else
+        invalid_arg "leb128 varint is too long"
+    done;
+
+    !n_consumed
+
+  let u64 (sl : Byte_slice.t) (off : int) : int64 * int =
+    let shift = ref 0 in
+    let res = ref 0L in
+    let continue = ref true in
+
+    let off = ref off in
+    let n_consumed = ref 0 in
+
+    while !continue do
+      (* bounds-check every byte, not just the first one *)
+      if !off < 0 || !off >= sl.len then invalid_arg "out of bound";
+      incr n_consumed;
+      let b = Char.code (Bytes.get sl.bs (sl.off + !off)) in
+      let cur = b land 0x7f in
+      if cur <> b then (
+        (* at least one byte follows this one; an 11th byte cannot exist,
+           and shifting an int64 by 64 or more is unspecified *)
+        if !shift >= 63 then invalid_arg "leb128 varint is too long";
+        (res := Int64.(logor !res (shift_left (of_int cur) !shift)));
+        incr off;
+        shift := !shift + 7
+      ) else if !shift < 63 || b land 0x7f <= 1 then (
+        (* the 10th byte (shift = 63) may only carry bit 63 *)
+        (res := Int64.(logor !res (shift_left (of_int b) !shift)));
+        continue := false
+      ) else
+        invalid_arg "leb128 varint is too long"
+    done;
+
+    !res, !n_consumed
+
+  let[@inline] uint_truncate sl off =
+    let v, n_consumed = u64 sl off in
+    Int64.to_int v, n_consumed
+
+  let[@inline] decode_zigzag (v : int64) : int64 =
+    (* [v] is unsigned, so the shift must be logical: an arithmetic shift
+       replicates bit 63 and corrupts values outside [-2^62, 2^62) *)
+    Int64.(logxor (shift_right_logical v 1) (neg (logand v Int64.one)))
+
+  let[@inline] i64 sl off : int64 * int =
+    let v, n_consumed = u64 sl off in
+    decode_zigzag v, n_consumed
+
+  let[@inline] int_truncate sl off =
+    let v, n_consumed = u64 sl off in
+    Int64.to_int (decode_zigzag v), n_consumed
+end
+
+module Encode = struct
+  let[@inline] encode_zigzag (i : int64) : int64 =
+    Int64.(logxor (shift_left i 1) (shift_right i 63))
+
+  external varint_size : (int64[@unboxed]) -> int
+    = "caml_cc_leb128_varint_size_byte" "caml_cc_leb128_varint_size"
+  [@@noalloc]
+  (** Compute how many bytes this int would occupy as varint *)
+
+  external varint_slice : bytes -> (int[@untagged]) -> (int64[@unboxed]) -> unit
+    = "caml_cc_leb128_varint_byte" "caml_cc_leb128_varint"
+  [@@noalloc]
+  (** Write this int as varint into the given slice *)
+
+  let[@inline] u64 (buf : Byte_buffer.t) (i : int64) =
+    let n = varint_size i in
+    Byte_buffer.ensure_free buf n;
+    assert (buf.len + n <= Bytes.length buf.bs);
+    varint_slice buf.bs buf.len i;
+    buf.len <- buf.len + n
+
+  let[@inline] i64 buf i : unit = u64 buf (encode_zigzag i)
+  let[@inline] uint buf i : unit = u64 buf (Int64.of_int i)
+  let[@inline] int buf i : unit = u64 buf (encode_zigzag (Int64.of_int i))
+end
diff --git a/src/leb128/containers_leb128.mli b/src/leb128/containers_leb128.mli
new file mode 100644
index 00000000..13136ba1
--- /dev/null
+++ b/src/leb128/containers_leb128.mli
@@ -0,0 +1,55 @@
+(** LEB128 encoding and decoding.
+
+    See https://en.wikipedia.org/wiki/LEB128 . *)
+
+module Byte_slice = CCByte_slice
+module Byte_buffer = CCByte_buffer
+
+module Decode : sig
+  val decode_zigzag : int64 -> int64
+  (** Turn an unsigned integer into a signed one.
+
+      See https://en.wikipedia.org/wiki/Variable-length_quantity#Zigzag_encoding
+  *)
+
+  val skip : Byte_slice.t -> int -> int
+  (** [skip slice off] reads an integer at offset [off] (relative to the start
+      of [slice]), and returns how many bytes the integer occupies.
+      @raise Invalid_argument
+        if the integer runs past the end of the slice or does not fit in 64
+        bits. *)
+
+  val u64 : Byte_slice.t -> int -> int64 * int
+  (** [u64 slice off] reads an integer at offset [off] (relative to the start
+      of [slice]), and returns a pair [v, n_consumed]. [v] is the read integer,
+      [n_consumed] is the number of bytes consumed during reading.
+      @raise Invalid_argument
+        if the integer runs past the end of the slice or does not fit in 64
+        bits. *)
+
+  val i64 : Byte_slice.t -> int -> int64 * int
+  (** Read a signed int64 by reading a u64 and zigzag decoding it *)
+
+  val int_truncate : Byte_slice.t -> int -> int * int
+  (** Like {!i64} but truncates to integer. Returns a pair [v, n_consumed]. *)
+
+  val uint_truncate : Byte_slice.t -> int -> int * int
+  (** Like {!u64} but truncates to integer. *)
+end
+
+module Encode : sig
+  val encode_zigzag : int64 -> int64
+  (** Turn a signed int64 into a u64 via zigzag encoding. *)
+
+  val u64 : Byte_buffer.t -> int64 -> unit
+  (** Write an unsigned int *)
+
+  val i64 : Byte_buffer.t -> int64 -> unit
+  (** Write a signed int via zigzag encoding *)
+
+  val uint : Byte_buffer.t -> int -> unit
+  (** Turn a uint into a u64 and write it *)
+
+  val int : Byte_buffer.t -> int -> unit
+  (** Turn an int into an int64 and write it via zigzag encoding *)
+end
diff --git a/src/leb128/dune b/src/leb128/dune
new file mode 100644
index 00000000..f9e45684
--- /dev/null
+++ b/src/leb128/dune
@@ -0,0 +1,11 @@
+(library
+ (name containers_leb128)
+ (public_name containers.leb128)
+ (synopsis
+  "LEB128 encoding (https://en.wikipedia.org/wiki/LEB128) for cephalopod")
+ (libraries containers)
+ (foreign_stubs
+  (language c)
+  (flags :standard -std=c99 -O2)
+  (names stubs))
+ (ocamlopt_flags :standard -inline 100))
diff --git a/src/leb128/stubs.c b/src/leb128/stubs.c
new file mode 100644
index 00000000..a740d9de
--- /dev/null
+++ b/src/leb128/stubs.c
@@ -0,0 +1,78 @@
+
+// readapted from ocaml-protoc, original code also from c-cube
+
+#include <caml/alloc.h>
+#include <caml/memory.h>
+#include <caml/mlvalues.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+// number of bytes needed to write `i` as an unsigned LEB128 varint (1 to 10)
+static inline int ix_leb128_varint_size(uint64_t i) {
+/* generated with:
+for i in range(1,10):
+  ceiling = (1 << (i*7))-1
+  print(f'if (i <= {ceiling}L) return {i};')
+*/
+
+  if (i <= 127L) return 1;
+  if (i <= 16383L) return 2;
+  if (i <= 2097151L) return 3;
+  if (i <= 268435455L) return 4;
+  if (i <= 34359738367L) return 5;
+  if (i <= 4398046511103L) return 6;
+  if (i <= 562949953421311L) return 7;
+  if (i <= 72057594037927935L) return 8;
+  if (i <= 9223372036854775807L) return 9;
+  return 10;
+}
+
+// number of bytes for i (unboxed version, for native code)
+CAMLprim value caml_cc_leb128_varint_size(int64_t i) {
+  int res = ix_leb128_varint_size((uint64_t)i);
+  return Val_int(res);
+}
+
+// boxed version, for bytecode
+CAMLprim value caml_cc_leb128_varint_size_byte(value v_i) {
+  CAMLparam1(v_i);
+
+  int64_t i = Int64_val(v_i);
+  int res = ix_leb128_varint_size((uint64_t)i);
+  CAMLreturn(Val_int(res));
+}
+
+// write i at str[0…] in varint; the caller must have reserved
+// ix_leb128_varint_size(i) bytes, nothing is bounds-checked here
+static inline void ix_leb128_varint(unsigned char *str, uint64_t i) {
+  while (true) {
+    uint64_t cur = i & 0x7f;
+    if (cur == i) {
+      *str = (unsigned char)cur;
+      break;
+    } else {
+      *str = (unsigned char)(cur | 0x80);
+      i = i >> 7;
+      ++str;
+    }
+  }
+}
+
+// write `i` starting at `idx` (unboxed version, for native code)
+CAMLprim value caml_cc_leb128_varint(value _str, intnat idx, int64_t i) {
+  // Bytes_val yields `unsigned char *`, which is what the writer expects
+  unsigned char *str = Bytes_val(_str);
+  ix_leb128_varint(str + idx, (uint64_t)i);
+  return Val_unit;
+}
+
+// boxed version, for bytecode
+CAMLprim value caml_cc_leb128_varint_byte(value _str, value _idx, value _i) {
+  CAMLparam3(_str, _idx, _i);
+  unsigned char *str = Bytes_val(_str);
+  // keep the full width of the OCaml int instead of narrowing it to `int`
+  intnat idx = Long_val(_idx);
+  int64_t i = Int64_val(_i);
+  ix_leb128_varint(str + idx, (uint64_t)i);
+  CAMLreturn(Val_unit);
+}