diff --git a/benchs/dune b/benchs/dune index f0d36ef7..666f57f6 100644 --- a/benchs/dune +++ b/benchs/dune @@ -10,7 +10,6 @@ qcheck oseq batteries - base sek) (flags :standard -warn-error -3-5 -w -60 -safe-string -color always) (optional) diff --git a/benchs/run_benchs.ml b/benchs/run_benchs.ml index db23a3c4..4d8357cc 100644 --- a/benchs/run_benchs.ml +++ b/benchs/run_benchs.ml @@ -1535,7 +1535,7 @@ module Str = struct let rand_str_ ?(among = "abcdefgh") n = let module Q = QCheck in let st = Random.State.make [| n + 17 |] in - let gen_c = QCheck.Gen.oneofl (CCString.to_list among) in + let gen_c = QCheck.Gen.oneof_list (CCString.to_list among) in QCheck.Gen.string_size ~gen:gen_c (QCheck.Gen.return n) st let find ?(start = 0) ~sub s = @@ -1824,4 +1824,32 @@ module Str = struct ]) end +module Hash = struct + let hash_ocaml (n : int) : int = + let offset_basis = 0xcbf29ce484222325L in + let prime = 0x100000001b3L in + let h = ref offset_basis in + for k = 0 to 7 do + (h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff)))); + h := Int64.(mul !h prime) + done; + Int64.to_int !h land max_int + + let bench_hash n = + let run_ocaml () = + for i = 0 to n - 1 do + opaque_ignore (hash_ocaml i) + done + and run_c_stub () = + for i = 0 to n - 1 do + opaque_ignore (CCInt.hash i) + done + in + B.throughputN 3 ~repeat + [ "ocaml_fnv", run_ocaml, (); "c_stub", run_c_stub, () ] + + let () = + B.Tree.register ("hash" @>>> [ "int" @>> app_ints bench_hash [ 1_000 ] ]) +end + let () = try B.Tree.run_global () with Arg.Help msg -> print_endline msg diff --git a/src/core/CCHash.ml b/src/core/CCHash.ml index b4fdc384..f7f7a12f 100644 --- a/src/core/CCHash.ml +++ b/src/core/CCHash.ml @@ -7,31 +7,33 @@ type 'a t = 'a -> hash type 'a iter = ('a -> unit) -> unit type 'a gen = unit -> 'a option -(* FNV hashing +(* same as CCInt: *) +open struct + external hash_int_ : (int[@untagged]) -> (int[@untagged]) + = "caml_cc_hash_int_byte" "caml_cc_hash_int" + [@@noalloc] + + external hash64_ : (int64[@unboxed]) -> (int[@untagged]) + = "caml_cc_hash_int64_byte" "caml_cc_hash_int64" + [@@noalloc] + + (* FNV-1a hashing (XOR then multiply ) https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *) -let fnv_offset_basis = 0xcbf29ce484222325L -let fnv_prime = 0x100000001b3L - -(* hash an integer *) -let hash_int_ n = - let h = ref fnv_offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff))) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int + let fnv_offset_basis = 0xcbf29ce484222325L + let fnv_prime = 0x100000001b3L +end +(* TODO: also port to C *) let combine2 a b = let h = ref fnv_offset_basis in (* we only do one loop, where we mix bytes of [a] and [b], so as to simplify control flow *) for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff))) + (h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))); + h := Int64.(mul !h fnv_prime) done; Int64.to_int !h land max_int @@ -42,26 +44,26 @@ let combine3 a b c = (* we only do one loop, where we mix bytes of [a] [b] and [c], so as to simplify control flow *) for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff))) + (h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff)))); + h := Int64.(mul !h fnv_prime) done; Int64.to_int !h land max_int let combine4 a b c d = let h = ref fnv_offset_basis in for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); (h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff)))); (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((d lsr (k * 8)) land 0xff))) + (h := Int64.(logxor !h (of_int ((d lsr (k * 8)) land 0xff)))); + h := Int64.(mul !h fnv_prime) done; Int64.to_int !h land max_int @@ -72,29 +74,19 @@ let combine6 a b c d e f = combine4 a b c (combine3 d e f) let const h _ = h let const0 _ = 0 -let int = hash_int_ +let[@inline] int i = hash_int_ i -let bool b = +let[@inline] bool b = hash_int_ (if b then 1 else 2) -let char x = hash_int_ (Char.code x) - -(* hash an integer *) -let int64 n : int = - let h = ref fnv_offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (logand (shift_right_logical n (k * 8)) 0xffL)) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int - -let int32 (x : int32) = int64 (Int64.of_int32 x) -let nativeint (x : nativeint) = int64 (Int64.of_nativeint x) +let[@inline] char x = hash_int_ (Char.code x) +let int64 = hash64_ +let[@inline] int32 (x : int32) = int64 (Int64.of_int32 x) +let[@inline] nativeint (x : nativeint) = int64 (Int64.of_nativeint x) (* do not hash more than 128 bytes in strings/bytes *) let max_len_b_ = 128 @@ -102,9 +94,9 @@ let max_len_b_ = 128 let bytes (x : bytes) = let h = ref fnv_offset_basis in for i = 0 to min max_len_b_ (Bytes.length x - 1) do - (h := Int64.(mul !h fnv_prime)); let byte = Char.code (Bytes.unsafe_get x i) in - h := Int64.(logxor !h (of_int byte)) + (h := Int64.(logxor !h (of_int byte))); + h := Int64.(mul !h fnv_prime) done; Int64.to_int !h land max_int diff --git a/src/core/CCInt.ml b/src/core/CCInt.ml index 801b0857..c9a6115e 100644 --- a/src/core/CCInt.ml +++ b/src/core/CCInt.ml @@ -4,20 +4,11 @@ include Int type 'a iter = ('a -> unit) -> unit -(* use FNV: +(* use FNV-1: https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *) -let hash (n : int) : int = - let offset_basis = 0xcbf29ce484222325L in - let prime = 0x100000001b3L in - - let h = ref offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h prime)); - (* h := h xor (k-th byte of n) *) - h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff))) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int +external hash : (int[@untagged]) -> (int[@untagged]) + = "caml_cc_hash_int_byte" "caml_cc_hash_int" +[@@noalloc] let range i j yield = let rec up i j yield = diff --git a/src/core/CCInt64.ml b/src/core/CCInt64.ml index 8bb45ead..f70e48bc 100644 --- a/src/core/CCInt64.ml +++ b/src/core/CCInt64.ml @@ -11,21 +11,12 @@ let max : t -> t -> t = Stdlib.max let sign i = compare i zero -(* use FNV: - https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *) -let hash_to_int64 (n : t) = - let offset_basis = 0xcbf29ce484222325L in - let prime = 0x100000001b3L in +external hash_to_int64 : (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_hash_int64_to_int64_byte" "caml_cc_hash_int64_to_int64" - let h = ref offset_basis in - for k = 0 to 7 do - h := mul !h prime; - (* h := h xor (k-th byte of n) *) - h := logxor !h (logand (shift_right n (k * 8)) 0xffL) - done; - logand !h max_int - -let[@inline] hash (n : t) : int = to_int (hash_to_int64 n) land Stdlib.max_int +external hash : (int64[@unboxed]) -> (int[@untagged]) + = "caml_cc_hash_int64_byte" "caml_cc_hash_int64" +[@@noalloc] (* see {!CCInt.popcount} for more details *) let[@inline] popcount (b : t) : int = diff --git a/src/core/cc_stubs.c b/src/core/cc_stubs.c new file mode 100644 index 00000000..cebd8b14 --- /dev/null +++ b/src/core/cc_stubs.c @@ -0,0 +1,54 @@ +#include +#include +#include + +/* FNV-1a hash for a 64-bit integer. + https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function */ + +static inline int64_t cc_fnv_hash_int64(int64_t n) { + uint64_t un = (uint64_t)n; + uint64_t h = UINT64_C(0xcbf29ce484222325); + const uint64_t prime = UINT64_C(0x100000001b3); + for (int k = 0; k < 8; k++) { + h ^= (un >> (k * 8)) & 0xff; + h *= prime; + } + return (int64_t)h; +} + +/* Mask to the OCaml int range (63 bits on 64-bit, 31 on 32-bit) + before hashing, so negative OCaml ints hash the same as + the unsigned representation seen by OCaml's [lsr]. */ +#define OCAML_INT_MASK ((UINT64_C(1) << (8 * sizeof(value) - 1)) - 1) + +/* native: untagged int in, untagged int out */ +CAMLprim intnat caml_cc_hash_int(intnat n) { + int64_t projected = (int64_t)((uint64_t)n & OCAML_INT_MASK); + return (intnat)((uint64_t)cc_fnv_hash_int64(projected) & Max_long); +} + +/* bytecode: boxed value in, boxed value out */ +CAMLprim value caml_cc_hash_int_byte(value v_n) { + return Val_long(caml_cc_hash_int(Long_val(v_n))); +} + +/* native: unboxed int64 in, untagged int out */ +CAMLprim intnat caml_cc_hash_int64(int64_t n) { + return (intnat)((uint64_t)cc_fnv_hash_int64(n) & Max_long); +} + +/* bytecode: boxed int64 value in, boxed value out */ +CAMLprim value caml_cc_hash_int64_byte(value v_n) { + return Val_long(caml_cc_hash_int64(Int64_val(v_n))); +} + +/* native: unboxed int64 in, unboxed int64 out. + Masks to non-negative int64 (matches OCaml's Int64.max_int). */ +CAMLprim int64_t caml_cc_hash_int64_to_int64(int64_t n) { + return cc_fnv_hash_int64(n) & INT64_MAX; +} + +/* bytecode: boxed int64 in, boxed int64 out */ +CAMLprim value caml_cc_hash_int64_to_int64_byte(value v_n) { + return caml_copy_int64(cc_fnv_hash_int64(Int64_val(v_n)) & INT64_MAX); +} diff --git a/src/core/dune b/src/core/dune index bd362200..86872496 100644 --- a/src/core/dune +++ b/src/core/dune @@ -6,6 +6,10 @@ (action (run %{project_root}/src/core/cpp/cpp.exe %{input-file}))) (flags :standard -nolabels -open CCMonomorphic) + (foreign_stubs + (language c) + (flags :standard -std=c99 -O2) + (names cc_stubs)) (libraries either containers.monomorphic containers.domain)) (ocamllex diff --git a/tests/core/t_hash.ml b/tests/core/t_hash.ml index 37b1d6d1..54eeb050 100644 --- a/tests/core/t_hash.ml +++ b/tests/core/t_hash.ml @@ -1,6 +1,19 @@ open CCHash module T = (val Containers_testlib.make ~__FILE__ ()) -include T;; +include T + +open struct + let hash_ocaml64 (n : int64) : int = + let offset_basis = 0xcbf29ce484222325L in + let prime = 0x100000001b3L in + let h = ref offset_basis in + for k = 0 to 7 do + (h := Int64.(logxor !h (Int64.logand (Int64.shift_left n (k * 8)) 0xffL))); + h := Int64.(mul !h prime) + done; + Int64.to_int !h land max_int +end +;; t @@ fun () -> int 42 >= 0;; t @@ fun () -> int max_int >= 0;; @@ -17,3 +30,7 @@ t @@ fun () -> string "abc" <> string "abcd";; q Q.int (fun i -> Q.assume (i >= 0); int i = int64 (Int64.of_int i)) +;; + +q Q.int64 + Q.(fun i -> Int64.compare i 0L >= 0 ==> (CCInt64.hash i = hash_ocaml64 i)) diff --git a/tests/core/t_int64.ml b/tests/core/t_int64.ml index 72078a15..c958520c 100644 --- a/tests/core/t_int64.ml +++ b/tests/core/t_int64.ml @@ -101,3 +101,42 @@ eq' 63 (popcount max_int);; eq' 1 (popcount min_int);; eq' 10 (popcount 0b1110010110110001010L);; eq' 5 (popcount 0b1101110000000000L) + +(* hash tests *) +let ( >= ) = Stdlib.( >= ) +let ( = ) = Stdlib.( = ) +let ( <> ) = Stdlib.( <> );; + +(* hash is non-negative *) +t @@ fun () -> hash 0L >= 0;; +t @@ fun () -> hash 1L >= 0;; +t @@ fun () -> hash (-1L) >= 0;; +t @@ fun () -> hash max_int >= 0;; +t @@ fun () -> hash min_int >= 0;; + +(* hash_to_int64 is non-negative *) +t @@ fun () -> CCInt64.compare (hash_to_int64 0L) 0L >= 0;; +t @@ fun () -> CCInt64.compare (hash_to_int64 (-1L)) 0L >= 0;; +t @@ fun () -> CCInt64.compare (hash_to_int64 min_int) 0L >= 0;; + +(* hash is consistent with hash_to_int64 *) +t @@ fun () -> hash 42L = Stdlib.(Int64.to_int (hash_to_int64 42L) land max_int) +;; + +t @@ fun () -> +hash (-1L) = Stdlib.(Int64.to_int (hash_to_int64 (-1L)) land max_int) +;; + +(* different inputs produce different hashes *) +t @@ fun () -> hash 0L <> hash 1L;; +t @@ fun () -> hash 1L <> hash 2L;; +t @@ fun () -> hash 1L <> hash (-1L);; +t @@ fun () -> hash_to_int64 0L <> hash_to_int64 1L;; + +(* deterministic *) +t @@ fun () -> hash 123L = hash 123L;; +t @@ fun () -> hash_to_int64 123L = hash_to_int64 123L;; + +(* quickcheck: hash is always non-negative *) +q Q.(map Int64.of_int int) (fun n -> hash n >= 0);; +q Q.(map Int64.of_int int) (fun n -> CCInt64.compare (hash_to_int64 n) 0L >= 0)