new module CCBloom in containers.data, a bloom filter

This commit is contained in:
Simon Cruanes 2015-09-06 21:44:55 +02:00
parent 369a13cea8
commit ecbddc132b
6 changed files with 249 additions and 2 deletions

View file

@ -118,6 +118,7 @@ Documentation [here](http://cedeela.fr/~simon/software/containers).
### Containers.data ### Containers.data
- `CCBitField`, bitfields embedded in integers - `CCBitField`, bitfields embedded in integers
- `CCBloom`, a bloom filter
- `CCCache`, memoization caches, LRU, etc. - `CCCache`, memoization caches, LRU, etc.
- `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation - `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation
- `CCTrie`, a prefix tree - `CCTrie`, a prefix tree

2
_oasis
View file

@ -85,7 +85,7 @@ Library "containers_data"
CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl, CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl,
CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray, CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray,
CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField, CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField,
CCHashTrie CCHashTrie, CCBloom
BuildDepends: bytes BuildDepends: bytes
FindlibParent: containers FindlibParent: containers
FindlibName: data FindlibName: data

View file

@ -64,6 +64,7 @@ Various data structures.
{!modules: {!modules:
CCBitField CCBitField
CCBloom
CCBV CCBV
CCCache CCCache
CCFQueue CCFQueue

166
src/data/CCBloom.ml Normal file
View file

@ -0,0 +1,166 @@
(* This file is free software, part of containers. See file "license" for more details. *)
(** {1 Bloom Filter} *)
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
type 'a hash_funs = ('a -> int) array
let primes_ = [|
2; 3; 5; 7; 11; 13; 17; 19; 23; 29; 31; 37; 41; 43; 47; 53; 59; 61; 67; 71;
73; 79; 83; 89; 97; 101; 103; 107; 109; 113; 127; 131; 137; 139;
149; 151; 157; 163; 167; 173
|]
let default_hash_funs k =
Array.init k
(fun i ->
let seed = if i<Array.length primes_ then primes_.(i) else i in
fun x -> Hashtbl.seeded_hash seed x
)
(** {2 Bloom Filter} *)
type 'a t = {
hash_funs : 'a hash_funs;
arr : Bytes.t;
}
let mk_default_ size =
default_hash_funs (max 2 (size / 20))
let create ?hash size =
if size < 2 then invalid_arg "CCBloom.create";
let hash_funs = match hash with
| None -> mk_default_ size
| Some h -> h
in
let arr = Bytes.make size '\000' in
{ hash_funs; arr }
let create_default ?hash_len size =
let hash = match hash_len with
| None -> mk_default_ size
| Some n -> default_hash_funs n
in
create ~hash size
let copy f =
{f with arr= Bytes.copy f.arr }
let size f = 8 * Bytes.length f.arr
(* number of 1 bits in [c] *)
let rec popcount_byte_ c =
if c=0 then 0
else
(c land 1) + popcount_byte_ (c lsr 1)
let () = assert (
popcount_byte_ 0 = 0 &&
popcount_byte_ 3 = 2 &&
popcount_byte_ 255 = 8
)
(* count the number of 1 bits *)
let rec count_ones_ arr i acc =
if i=Bytes.length arr then acc
else
let c = Char.code (Bytes.get arr i) in
count_ones_ arr (i+1) (acc + popcount_byte_ c)
let load f =
let ones = count_ones_ f.arr 0 0 in
float_of_int ones /. (float_of_int (Bytes.length f.arr * 8))
exception LocalExit
(* get i-th bit *)
let get_ arr i =
let j = i / 8 in
let c = Char.code (Bytes.get arr j) in
c land (1 lsl (i mod 8)) <> 0
(* set i-th bit *)
let set_ arr i =
let j = i / 8 in
let c = Char.code (Bytes.get arr j) in
let c = c lor (1 lsl (i mod 8)) in
Bytes.set arr j (Char.chr c)
let mem f x =
let n = size f in
try
Array.iter
(fun hash -> if not (get_ f.arr (hash x mod n)) then raise LocalExit)
f.hash_funs;
true
with LocalExit -> false
let add f x =
let n = size f in
Array.iter
(fun hash -> set_ f.arr (hash x mod n))
f.hash_funs
(*$Q
Q.(list int) (fun l -> \
let f = create 30 in add_list f l ; \
List.for_all (mem f) l)
*)
let union_mut ~into f =
if size into <> size f then invalid_arg "CCBloom.union_mut";
Bytes.iteri
(fun i c ->
Bytes.set into.arr i
(Char.chr (Char.code (Bytes.get into.arr i) lor (Char.code c)))
) f.arr
let union a b =
if size a <> size b then invalid_arg "CCBloom.union";
let into = copy a in
union_mut ~into b;
into
(*$Q
Q.(pair (list int)(list int)) (fun (l1,l2) -> \
let f1=create 100 and f2 = create 100 in \
add_list f1 l1; add_list f2 l2; \
let f = union f1 f2 in \
List.for_all (fun i -> not (mem f1 i) || mem f i) l1 && \
List.for_all (fun i -> not (mem f2 i) || mem f i) l2)
*)
let inter_mut ~into f =
if size into <> size f then invalid_arg "CCBloom.inter_mut";
Bytes.iteri
(fun i c ->
Bytes.set into.arr i
(Char.chr (Char.code (Bytes.get into.arr i) land (Char.code c)))
) f.arr
let inter a b =
if size a <> size b then invalid_arg "CCBloom.inter";
let into = copy a in
inter_mut ~into b;
into
(*$Q
Q.(pair (list int)(list int)) (fun (l1,l2) -> \
let f1=create 100 and f2 = create 100 in \
add_list f1 l1; add_list f2 l2; \
let f = inter f1 f2 in \
List.for_all (fun i -> not (mem f1 i) || not (mem f2 i) || mem f i) (l1@l2))
*)
let add_list f l = List.iter (add f) l
let add_seq f seq = seq (add f)
let rec add_gen f g = match g() with
| None -> ()
| Some x -> add f x; add_gen f g

79
src/data/CCBloom.mli Normal file
View file

@ -0,0 +1,79 @@
(* This file is free software, part of containers. See file "license" for more details. *)
(** {1 Bloom Filter}
{b status: experimental}
@since NEXT_RELEASE *)
type 'a sequence = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
type 'a hash_funs = ('a -> int) array
(** An array of [k] hash functions on values of type ['a].
Never ever modify such an array after use! *)
val default_hash_funs : int -> 'a hash_funs
(** Use {!Hashtbl.seeded_hash} on [k] seeds
@param k the number of hash functions required *)
(** {2 Bloom Filter} *)
type 'a t
(** Bloom filter containing values of type ['a] *)
val create : ?hash:('a hash_funs) -> int -> 'a t
(** [create ?hash size] creates a filter with given size, and functions.
By default it uses {!default_hash_funs}
@param size a hint for size *)
val create_default : ?hash_len:int -> int -> 'a t
(** [create_default ?hash_len size] is the same as
[create ~hash:(default_hash_funs hash_len) size].
It uses the given number of default hash functions.
@param size a hint for size *)
val copy : 'a t -> 'a t
(** Copy of the filter *)
val size : _ t -> int
(** Length of the underlying array. Do not confuse with a cardinal function,
which is impossible to write for bloom filters *)
val load : _ t -> float
(** Ratio of 1 bits in the underlying array. The closer to [1.], the less
accurate {!mem} is *)
val mem : 'a t -> 'a -> bool
(** [mem f x] tests whether [x] (probably) belongs in [f] *)
val add : 'a t -> 'a -> unit
(** [add f x] adds [x] into [f] *)
val union_mut : into:'a t -> 'a t -> unit
(** [union_mut ~into f] changes [into] into the union of [into] and [f].
[into] and [f] MUST have the same set of hash functions
@raise Invalid_argument if the two sets do not have the same size *)
val union : 'a t -> 'a t -> 'a t
(** the sets MUST have the same set of hash functions
@raise Invalid_argument if the two sets do not have the same size *)
val inter_mut : into:'a t -> 'a t -> unit
(** [inter_mut ~into f] changes [into] into the intersection of [into] and [f]
[into] and [f] MUST have the same set of hash functions
@raise Invalid_argument if the two sets do not have the same size *)
val inter : 'a t -> 'a t -> 'a t
(** the sets MUST have the same set of hash functions
@raise Invalid_argument if the two sets do not have the same size *)
(** {2 Conversions} *)
val add_list : 'a t -> 'a list -> unit
val add_seq : 'a t -> 'a sequence -> unit
val add_gen : 'a t -> 'a gen -> unit

View file

@ -11,7 +11,7 @@
Preliminary benchmarks (see the "tbl" section of benchmarks) tend to show Preliminary benchmarks (see the "tbl" section of benchmarks) tend to show
that this type is quite efficient for small data sets. that this type is quite efficient for small data sets.
{b status: experimental} {b status: unstable}
@since NEXT_RELEASE @since NEXT_RELEASE
*) *)