trie for indexing strings, to be later retrieved using Lenvenshtein automaton,

implemented (with add,remove,retrieve ops)
This commit is contained in:
Simon Cruanes 2014-03-05 00:45:53 +01:00
parent 99dfaecd64
commit 8e5cde6f29
2 changed files with 183 additions and 36 deletions

View file

@ -371,6 +371,18 @@ let rec __find_char ~compare c l = match l with
then next then next
else __find_char ~compare c l' else __find_char ~compare c l'
(* transition for [c] in state [i] of [dfa];
@raise Not_found if no transition matches *)
let __transition dfa i c =
let transitions = DFA.get dfa i in
try
__find_char ~compare:dfa.DFA.compare c transitions
with Not_found ->
let o = DFA.otherwise dfa i in
if o >= 0
then o
else raise Not_found
(* real matching function *) (* real matching function *)
let __match ~len ~get dfa = let __match ~len ~get dfa =
let rec search i state = let rec search i state =
@ -378,17 +390,12 @@ let __match ~len ~get dfa =
if i = len if i = len
then DFA.is_final dfa state then DFA.is_final dfa state
else begin else begin
let transitions = DFA.get dfa state in
(* current char *) (* current char *)
let c = get i in let c = get i in
try try
let next = __find_char ~compare:dfa.DFA.compare c transitions in let next = __transition dfa state c in
search (i+1) next search (i+1) next
with Not_found -> with Not_found -> false
let o = DFA.otherwise dfa state in
if o >= 0
then search (i+1) o
else false
end end
in in
search 0 0 search 0 0
@ -408,31 +415,143 @@ type 'a klist =
| `Cons of 'a * (unit -> 'a klist) | `Cons of 'a * (unit -> 'a klist)
] ]
module Index = struct let rec klist_to_list = function
type ('a, 'b) node = | `Nil -> []
| Empty | `Cons (x,k) -> x :: klist_to_list (k ())
| Node of 'b option * ('a, 'b) assoc_list
and ('a, 'b) assoc_list = ('a * ('a, 'b) node) list
type ('a, 'b) t = { module Index(X : Map.OrderedType) = struct
tree : ('a, 'b) node; type key = X.t
compare : 'a -> 'a -> int;
}
let empty ?(compare=Pervasives.compare) () = { module M = Map.Make(X)
tree = Empty;
compare; type 'b t =
} | Node of 'b option * 'b t M.t
let empty = Node (None, M.empty)
let is_empty = function
| Node (None, m) -> M.is_empty m
| _ -> false
let () = assert (is_empty empty)
(** get/add/remove the leaf for the given array.
the continuation k takes the leaf, and returns a leaf option
that replaces the old leaf.
This function returns the new trie. *)
let goto_leaf ~len ~get node k =
(* insert the value in given [node], assuming the current index
in [arr] is [i]. [k] is given the resulting tree. *)
let rec goto node i rebuild = match node with
| _ when i = len ->
let node' = k node in
rebuild node'
| Node (opt, m) ->
let c = get i in
let t' =
try M.find c m
with Not_found -> empty
in
goto t' (i+1)
(fun t'' ->
if is_empty t''
then rebuild (Node (opt, M.remove c m))
else rebuild (Node (opt, M.add c t'' m)))
in
goto node 0 (fun t -> t)
let __add ~len ~get trie value =
goto_leaf ~len ~get trie
(function
| Node (_, m) -> Node (Some value, m))
let __remove ~len ~get trie value =
goto_leaf ~len ~get trie
(function
| Node (_, m) -> Node (None, m))
(* traverse the automaton and the idx, yielding a klist of values *)
let __retrieve dfa idx =
(* traverse at index i in automaton, with
[fk] the failure continuation *)
let rec traverse node i ~(fk:unit->'a klist) =
match node with
| Node (opt, m) ->
(* all alternatives: continue exploring [m], or call [fk] *)
let fk =
M.fold
(fun c node' fk ->
try
let next = __transition dfa i c in
(fun () -> traverse node' next ~fk)
with Not_found -> fk)
m fk
in
match opt with
| Some v when DFA.is_final dfa i ->
(* yield one solution now *)
`Cons (v, fk)
| _ -> fk () (* fail... or explore subtrees *)
in
traverse idx 0 ~fk:(fun () -> `Nil)
let add idx arr value = let add idx arr value =
assert false (* TODO *) __add ~len:(Array.length arr) ~get:(Array.get arr) idx value
let add_string idx arr str = let remove idx arr value =
assert false (* TODO *) __remove ~len:(Array.length arr) ~get:(Array.get arr) idx value
let retrieve ~limit idx arr = let retrieve ~limit idx arr =
assert false (* TODO *) let automaton = of_array ~compare:X.compare ~limit arr in
__retrieve automaton idx
let of_list l =
List.fold_left
(fun acc (arr,v) -> add acc arr v)
empty l
let __to_list ~of_list idx =
let rec explore acc trail node = match node with
| Node (opt, m) ->
(* first, yield current value, if any *)
let acc = match opt with
| None -> acc
| Some v -> (of_list (List.rev trail), v) :: acc
in
M.fold
(fun c node' acc -> explore acc (c::trail) node')
m acc
in
explore [] [] idx
let to_list idx =
__to_list ~of_list:Array.of_list idx
end
module StrIndex = struct
include Index(Char)
let add_string idx str value =
__add ~len:(String.length str) ~get:(String.get str) idx value
let remove_string idx str value =
__remove ~len:(String.length str) ~get:(String.get str) idx value
let retrieve_string ~limit idx str = let retrieve_string ~limit idx str =
assert false (* TODO *) let automaton = of_string ~limit str in
__retrieve automaton idx
let of_str_list l =
List.fold_left
(fun acc (str,v) -> add_string acc str v)
empty l
let to_str_list idx =
(* clumsy conversion [char list -> string] *)
let of_list l =
let s = String.make (List.length l) ' ' in
List.iteri (fun i c -> s.[i] <- c) l;
s
in
__to_list ~of_list idx
end end

View file

@ -65,24 +65,52 @@ type 'a klist =
| `Cons of 'a * (unit -> 'a klist) | `Cons of 'a * (unit -> 'a klist)
] ]
module Index : sig val klist_to_list : 'a klist -> 'a list
type ('a, 'b) t (** Helper. *)
(** Index that maps 'a strings to values of type 'b. Internally it is
module Index(X : Map.OrderedType) : sig
type key = X.t
type 'b t
(** Index that maps [key] strings to values of type 'b. Internally it is
based on a trie. *) based on a trie. *)
val empty : ?compare:('a -> 'a -> int) -> unit -> ('a, 'b) t val empty : 'b t
(** Empty index, possibly with a specific comparison function *) (** Empty index *)
val add : ('a, 'b) t -> 'a array -> 'b -> ('a, 'b) t val is_empty : _ t -> bool
val add : 'b t -> key array -> 'b -> 'b t
(** Add a char array to the index. If a value was already present (** Add a char array to the index. If a value was already present
for this char it is replaced. *) for this array it is replaced. *)
val add_string : (char, 'b) t -> string -> 'b -> (char, 'b) t val remove : 'b t -> key array -> 'b -> 'b t
(** Add a string to a char index *) (** Remove a char array from the index. *)
val retrieve : limit:int -> ('a, 'b) t -> 'a array -> 'b klist val retrieve : limit:int -> 'b t -> key array -> 'b klist
(** Lazy list of objects associated to strings close to (** Lazy list of objects associated to strings close to
the query string *) the query string *)
val retrieve_string : limit:int -> (char,'b) t -> string -> 'b klist val of_list : (key array * 'b) list -> 'b t
val to_list : 'b t -> (key array * 'b) list
(* TODO sequence/iteration functions *)
end
(** Specific case for strings *)
module StrIndex : sig
include module type of Index(Char)
val add_string : 'b t -> string -> 'b -> 'b t
(** Add a string to a char index *)
val remove_string : 'b t -> string -> 'b -> 'b t
(** Remove a string from a char index *)
val retrieve_string : limit:int -> 'b t -> string -> 'b klist
val of_str_list : (string * 'b) list -> 'b t
val to_str_list : 'b t -> (string * 'b) list
end end