refactor(flat_tbl): use only one array of slots + inline record

This commit is contained in:
Simon Cruanes 2021-04-03 00:48:38 -04:00
parent bee90ee6ea
commit 76a3fb953e

View file

@ -71,120 +71,91 @@ module Make(H : Hashtbl.HashedType) = struct
additional pointer anyway. *) additional pointer anyway. *)
type 'a slot = type 'a slot =
| Empty | Empty
| Used of key * 'a | Used of {
k: key;
mutable v: 'a;
hash: int;
mutable dib: int; (* DIB: distance to initial bucket *)
}
let max_load = 0.8 let max_load = 0.92
let probe_dist_n_bits = 7 (* store probe distance on <n> bits *)
type 'a t = { type 'a t = {
mutable meta: int array;
(* [hash | probe_distance[0..10] | present[1]]
for key at index [i] *)
mutable slots: 'a slot array; (* slot for index [i] *) mutable slots: 'a slot array; (* slot for index [i] *)
mutable size : int; mutable size : int;
(* TODO: [max_dist: int], so we can stop loopup early? *) mutable max_dib : int;
} }
let create size : _ t = let create size : _ t =
let size = max 8 size in let size = max 8 size in
{ slots = Array.make size Empty; { slots = Array.make size Empty;
meta = Array.make size 0;
size = 0; size = 0;
max_dib = 0;
} }
let copy self = let copy self =
{ slots = Array.copy self.slots; { slots = Array.copy self.slots;
meta = Array.copy self.meta;
size = self.size; size = self.size;
max_dib = self.max_dib;
} }
(** clear the table, by resetting all states to Empty *) (** clear the table, by resetting all states to Empty *)
let clear self = let clear self =
let {slots; meta; size=_} = self in let {slots; max_dib=_; size=_} = self in
Array.fill slots 0 (Array.length slots) Empty; Array.fill slots 0 (Array.length slots) Empty;
Array.fill meta 0 (Array.length meta ) 0; self.max_dib <- 0;
self.size <- 0 self.size <- 0
(* Index of slot, for i-th probing starting from hash [h] in (* Index of slot, for i-th probing starting from hash [h] in
a table of length [n] *) a table of length [n].
let[@inline] addr_ h n dist = (h + dist) mod n Note: we make sure the [h+dist] part is positive first,
and we do not use [abs] since it can be negative on [min_int]. *)
(* normalize h by removing bits that will not fit in storage *) let[@inline] addr_ h n dist =
let[@inline] normalize_hash_ h : int = ((h + dist) land max_int) mod n
(h lsl (1+probe_dist_n_bits)) lsr (1+probe_dist_n_bits)
(** [mk_meta_ hash dist] make new metadata *)
let mk_meta_ h dist : int =
let dist_mask = (1 lsl probe_dist_n_bits)-1 in
let dist = dist land dist_mask in
(* LSB=1 to indicate presence *)
(((h lsl probe_dist_n_bits) lor dist) lsl 1) lor 1
(* hash of metadata (truncated) *)
let[@inline] hash_of_meta_ m : int =
m lsr (probe_dist_n_bits+1)
(* probe distance of metadata (truncated) *)
let[@inline] dist_of_meta_ m : int =
(m lsr 1) land ((1 lsl probe_dist_n_bits)-1)
(* presence bit of metadata *)
let[@inline] presence_meta_ m : bool =
(m land 1) == 1
(* Insert [k -> v] in [self], starting with the hash [h]. (* Insert [k -> v] in [self], starting with the hash [h].
Does not modify the size. *) Does not modify the size. *)
let insert_ (self:_ t) h k v : unit = let insert_ (self:_ t) h k v : unit =
let {slots; meta; size=_} = self in let {slots; max_dib=_; size=_} = self in
let n = Array.length slots in let n = Array.length slots in
assert (n=Array.length meta);
(* lookup an empty slot to insert the key->value in. *) (* lookup an empty slot to insert the key->value in. *)
let rec insert_rec_ h k v dist = let rec insert_rec_ h k v dib =
let j = addr_ h n dist in let j = addr_ h n dib in
let m_j = Array.unsafe_get meta j in match Array.unsafe_get slots j with
let dist_j = dist_of_meta_ m_j in | Empty ->
let hash_j = hash_of_meta_ m_j in Array.unsafe_set slots j (Used {k; v; hash=h; dib});
self.max_dib <- max dib self.max_dib
| Used used_j ->
if used_j.hash = h && H.equal k used_j.k then (
(* same key: replace *)
used_j.v <- v;
used_j.dib <- dib;
self.max_dib <- max dib self.max_dib;
if not (presence_meta_ m_j) then ( ) else if used_j.dib < dib then (
(* empty slot *) (* displace element*)
let m = mk_meta_ h dist in let k_j = used_j.k in
meta.(j) <- m; let v_j = used_j.v in
slots.(j) <- Used (k, v); let h_j = used_j.hash in
) else if h <> hash_j && dist_j >= dist then ( let dib_j = used_j.dib in
(* different slot and hash (hence, key): try next slot *)
insert_rec_ h k v (dist+1)
) else (
let k_j, v_j =
match Array.unsafe_get slots j with
| Empty -> assert false
| Used (k,v) -> k, v
in
if H.equal k k_j then ( Array.unsafe_set slots j (Used {k;v;hash=h;dib});
(* replace slot, same key *) self.max_dib <- max dib self.max_dib;
slots.(j) <- Used (k, v);
) else if dist_j < dist then (
(* displace this element *)
let m = mk_meta_ h dist in insert_rec_ h_j k_j v_j dib_j
meta.(j) <- m;
slots.(j) <- Used (k, v);
insert_rec_ hash_j k_j v_j dist_j
) else ( ) else (
(* try next slot *) (* look further *)
insert_rec_ h k v (dist+1) insert_rec_ h k v (dib+1)
) )
)
in in
insert_rec_ h k v 0 insert_rec_ h k v 0
(* Resize the array, by inserting its content into twice as large an array *) (* Resize the array, by inserting its content into twice as large an array *)
let resize (self:_ t) : unit = let resize (self:_ t) : unit =
let {slots=old_slots; meta=old_meta; size=_} = self in let {slots=old_slots; max_dib=_; size=_} = self in
let new_size = let new_size =
let n = Array.length old_slots in let n = Array.length old_slots in
@ -194,61 +165,56 @@ module Make(H : Hashtbl.HashedType) = struct
if new_size <= Array.length old_slots then failwith "flat_tbl: cannot resize further"; if new_size <= Array.length old_slots then failwith "flat_tbl: cannot resize further";
self.slots <- Array.make new_size Empty; self.slots <- Array.make new_size Empty;
self.meta <- Array.make new_size 0; self.max_dib <- 0;
(* insert elements into new table *) (* insert elements into new table *)
Array.iteri Array.iter
(fun i slot -> match slot with (fun slot -> match slot with
| Empty -> () | Empty -> ()
| Used (k,v) -> | Used {k; v; hash; _} ->
let m = Array.unsafe_get old_meta i in insert_ self hash k v)
let h = hash_of_meta_ m in
insert_ self h k v)
old_slots; old_slots;
() ()
(* Lookup [key] in the table *) (* Lookup [key] in the table *)
let find_opt self k = let find_opt self k =
let {slots; meta; size=_} = self in let {slots; max_dib; size=_} = self in
let n = Array.length slots in let n = Array.length slots in
let h = normalize_hash_ (H.hash k) in let h = H.hash k in
let slots = self.slots in
let[@unroll 2] rec find_rec_ dist =
assert (dist < n); (* load factor would be 1 *)
let j = addr_ h n dist in
let m_j = Array.unsafe_get meta j in let rec find_rec_ dib =
if not (presence_meta_ m_j) then ( (*assert (dist < n); (* load factor would be 1 *)*)
None (* met empty slot *) let j = addr_ h n dib in
) else ( match Array.unsafe_get slots j with
| Empty -> None
(* TODO: if we store max_probe_dist, use this for early | Used used_j ->
termination if used_j.hash = h && H.equal used_j.k k then (
let dist_j = dist_of_meta_ m_j in Some used_j.v (* found *)
if dist_j > max_probe_dist then raise Not_found ) else if dib >= max_dib then (
*) None (* no need to go further *)
let h_j = hash_of_meta_ m_j in
if h <> h_j then (
(* different hash *)
find_rec_ (dist+1)
) else ( ) else (
(* unroll by hand *)
let dib = dib+1 in
let j = addr_ h n dib in
match Array.unsafe_get slots j with match Array.unsafe_get slots j with
| Used (k2, v) -> | Empty -> None
if H.equal k k2 then Some v | Used used_j ->
else ( if used_j.hash = h && H.equal used_j.k k then (
(* different key *) Some used_j.v (* found *)
find_rec_ (dist+1) ) else if dib >= max_dib then (
None (* no need to go further *)
) else (
find_rec_ (dib+1)
) )
| Empty -> assert false
) )
)
in in
(* try a direct hit first *) (* try a direct hit first *)
begin match Array.unsafe_get slots (addr_ h n 0) with begin match Array.unsafe_get slots (addr_ h n 0) with
| Empty -> None | Empty -> None
| Used (k2, v) when H.equal k k2 -> Some v | Used {k=k2; v; hash; _} when h = hash && H.equal k k2 -> Some v
| _ -> find_rec_ 1 | _ -> find_rec_ 1
end end
@ -265,7 +231,7 @@ module Make(H : Hashtbl.HashedType) = struct
resize self; resize self;
); );
let h = normalize_hash_ (H.hash k) in let h = H.hash k in
self.size <- 1 + self.size; self.size <- 1 + self.size;
insert_ self h k v insert_ self h k v
@ -273,59 +239,47 @@ module Make(H : Hashtbl.HashedType) = struct
(see https://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion/ ) (see https://codecapsule.com/2013/11/17/robin-hood-hashing-backward-shift-deletion/ )
to keep probe_distance low, instead of using tombstones. *) to keep probe_distance low, instead of using tombstones. *)
let remove self k : unit = let remove self k : unit =
let {slots; meta; size=_} = self in let {slots; max_dib=_; size=_} = self in
let n = Array.length slots in let n = Array.length slots in
let h = normalize_hash_ (H.hash k) in let h = H.hash k in
(* given that [i] is empty, and [i_succ = (i+1) mod n], (* given that [i] is empty, and [i_succ = (i+1) mod n],
see if we can shift the element at [i_succ] to the left see if we can shift the element at [i_succ] to the left
to decrease its probe count. *) to decrease its probe count. *)
let rec backward_shift_ i i_succ : unit = let rec backward_shift_ i i_succ : unit =
let m = Array.unsafe_get meta i_succ in match Array.unsafe_get slots i_succ with
if presence_meta_ m then ( | Empty -> ()
let dist = dist_of_meta_ m in
if dist > 0 then (
let slot = Array.unsafe_get slots i_succ in
assert (slot != Empty);
let m = mk_meta_ (hash_of_meta_ m) (dist-1) in | Used used as slot when used.dib > 0 ->
meta.(i) <- m; (* shift to the left, decreasing DIB by one *)
slots.(i) <- slot; Array.unsafe_set slots i slot;
meta.(i_succ) <- 0; (* cleanup i_succ *) used.dib <- used.dib - 1;
slots.(i_succ) <- Empty; Array.unsafe_set slots i_succ Empty;
backward_shift_ i_succ ((i_succ + 1) mod n) backward_shift_ i_succ ((i_succ + 1) mod n)
)
) | Used _ -> ()
in in
let rec find_rec_ dist = let rec find_rec_ dib =
assert (dist<n); assert (dib<n);
let j = addr_ h n dist in let j = addr_ h n dib in
let m_j = Array.unsafe_get meta j in begin match Array.unsafe_get slots j with
let hash_j = hash_of_meta_ m_j in | Empty -> ()
| _ when dib > self.max_dib -> ()
| Used used_j ->
if h = used_j.hash && H.equal k used_j.k then (
(* found element, remove it *)
Array.unsafe_set slots j Empty;
self.size <- self.size - 1;
if not (presence_meta_ m_j) then () (* early exit, key not present *) backward_shift_ j ((j+1) mod n); (* shift slots that come just next *)
else if h <> hash_j then (
find_rec_ (dist+1) (* go further *)
) else (
let k_j = match Array.unsafe_get slots j with
| Empty -> assert false
| Used (k, _) -> k
in
if H.equal k k_j then ( ) else (
(* found element, remove it *) find_rec_ (dib+1)
slots.(j) <- Empty; )
meta.(j) <- 0; end
self.size <- self.size - 1;
backward_shift_ j ((j+1) mod n); (* shift slots that come just next *)
) else (
find_rec_ (dist+1)
)
)
in in
if self.size > 0 then ( if self.size > 0 then (
@ -343,19 +297,18 @@ module Make(H : Hashtbl.HashedType) = struct
(* Iterate on key -> value pairs *) (* Iterate on key -> value pairs *)
let iter f self = let iter f self =
let slots = self.slots in Array.iter
for i = 0 to Array.length slots - 1 do (function
match Array.unsafe_get slots i with | Used {k; v; _} -> f k v
| Used (k, v) -> f k v | Empty -> ())
| _ -> () self.slots
done
(* Fold on key -> value pairs *) (* Fold on key -> value pairs *)
let fold f self acc = let fold f self acc =
Array.fold_left Array.fold_left
(fun acc sl -> match sl with (fun acc sl -> match sl with
| Empty -> acc | Empty -> acc
| Used (k,v) -> f k v acc) | Used {k;v;_} -> f k v acc)
acc self.slots acc self.slots
let to_iter t yield = let to_iter t yield =