mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 11:15:31 -05:00
trie for indexing strings, to be later retrieved using Lenvenshtein automaton,
implemented (with add,remove,retrieve ops)
This commit is contained in:
parent
99dfaecd64
commit
8e5cde6f29
2 changed files with 183 additions and 36 deletions
169
levenshtein.ml
169
levenshtein.ml
|
|
@ -371,6 +371,18 @@ let rec __find_char ~compare c l = match l with
|
||||||
then next
|
then next
|
||||||
else __find_char ~compare c l'
|
else __find_char ~compare c l'
|
||||||
|
|
||||||
|
(* transition for [c] in state [i] of [dfa];
|
||||||
|
@raise Not_found if no transition matches *)
|
||||||
|
let __transition dfa i c =
|
||||||
|
let transitions = DFA.get dfa i in
|
||||||
|
try
|
||||||
|
__find_char ~compare:dfa.DFA.compare c transitions
|
||||||
|
with Not_found ->
|
||||||
|
let o = DFA.otherwise dfa i in
|
||||||
|
if o >= 0
|
||||||
|
then o
|
||||||
|
else raise Not_found
|
||||||
|
|
||||||
(* real matching function *)
|
(* real matching function *)
|
||||||
let __match ~len ~get dfa =
|
let __match ~len ~get dfa =
|
||||||
let rec search i state =
|
let rec search i state =
|
||||||
|
|
@ -378,17 +390,12 @@ let __match ~len ~get dfa =
|
||||||
if i = len
|
if i = len
|
||||||
then DFA.is_final dfa state
|
then DFA.is_final dfa state
|
||||||
else begin
|
else begin
|
||||||
let transitions = DFA.get dfa state in
|
|
||||||
(* current char *)
|
(* current char *)
|
||||||
let c = get i in
|
let c = get i in
|
||||||
try
|
try
|
||||||
let next = __find_char ~compare:dfa.DFA.compare c transitions in
|
let next = __transition dfa state c in
|
||||||
search (i+1) next
|
search (i+1) next
|
||||||
with Not_found ->
|
with Not_found -> false
|
||||||
let o = DFA.otherwise dfa state in
|
|
||||||
if o >= 0
|
|
||||||
then search (i+1) o
|
|
||||||
else false
|
|
||||||
end
|
end
|
||||||
in
|
in
|
||||||
search 0 0
|
search 0 0
|
||||||
|
|
@ -408,31 +415,143 @@ type 'a klist =
|
||||||
| `Cons of 'a * (unit -> 'a klist)
|
| `Cons of 'a * (unit -> 'a klist)
|
||||||
]
|
]
|
||||||
|
|
||||||
module Index = struct
|
let rec klist_to_list = function
|
||||||
type ('a, 'b) node =
|
| `Nil -> []
|
||||||
| Empty
|
| `Cons (x,k) -> x :: klist_to_list (k ())
|
||||||
| Node of 'b option * ('a, 'b) assoc_list
|
|
||||||
and ('a, 'b) assoc_list = ('a * ('a, 'b) node) list
|
|
||||||
|
|
||||||
type ('a, 'b) t = {
|
module Index(X : Map.OrderedType) = struct
|
||||||
tree : ('a, 'b) node;
|
type key = X.t
|
||||||
compare : 'a -> 'a -> int;
|
|
||||||
}
|
|
||||||
|
|
||||||
let empty ?(compare=Pervasives.compare) () = {
|
module M = Map.Make(X)
|
||||||
tree = Empty;
|
|
||||||
compare;
|
type 'b t =
|
||||||
}
|
| Node of 'b option * 'b t M.t
|
||||||
|
|
||||||
|
let empty = Node (None, M.empty)
|
||||||
|
|
||||||
|
let is_empty = function
|
||||||
|
| Node (None, m) -> M.is_empty m
|
||||||
|
| _ -> false
|
||||||
|
|
||||||
|
let () = assert (is_empty empty)
|
||||||
|
|
||||||
|
(** get/add/remove the leaf for the given array.
|
||||||
|
the continuation k takes the leaf, and returns a leaf option
|
||||||
|
that replaces the old leaf.
|
||||||
|
This function returns the new trie. *)
|
||||||
|
let goto_leaf ~len ~get node k =
|
||||||
|
(* insert the value in given [node], assuming the current index
|
||||||
|
in [arr] is [i]. [k] is given the resulting tree. *)
|
||||||
|
let rec goto node i rebuild = match node with
|
||||||
|
| _ when i = len ->
|
||||||
|
let node' = k node in
|
||||||
|
rebuild node'
|
||||||
|
| Node (opt, m) ->
|
||||||
|
let c = get i in
|
||||||
|
let t' =
|
||||||
|
try M.find c m
|
||||||
|
with Not_found -> empty
|
||||||
|
in
|
||||||
|
goto t' (i+1)
|
||||||
|
(fun t'' ->
|
||||||
|
if is_empty t''
|
||||||
|
then rebuild (Node (opt, M.remove c m))
|
||||||
|
else rebuild (Node (opt, M.add c t'' m)))
|
||||||
|
in
|
||||||
|
goto node 0 (fun t -> t)
|
||||||
|
|
||||||
|
let __add ~len ~get trie value =
|
||||||
|
goto_leaf ~len ~get trie
|
||||||
|
(function
|
||||||
|
| Node (_, m) -> Node (Some value, m))
|
||||||
|
|
||||||
|
let __remove ~len ~get trie value =
|
||||||
|
goto_leaf ~len ~get trie
|
||||||
|
(function
|
||||||
|
| Node (_, m) -> Node (None, m))
|
||||||
|
|
||||||
|
(* traverse the automaton and the idx, yielding a klist of values *)
|
||||||
|
let __retrieve dfa idx =
|
||||||
|
(* traverse at index i in automaton, with
|
||||||
|
[fk] the failure continuation *)
|
||||||
|
let rec traverse node i ~(fk:unit->'a klist) =
|
||||||
|
match node with
|
||||||
|
| Node (opt, m) ->
|
||||||
|
(* all alternatives: continue exploring [m], or call [fk] *)
|
||||||
|
let fk =
|
||||||
|
M.fold
|
||||||
|
(fun c node' fk ->
|
||||||
|
try
|
||||||
|
let next = __transition dfa i c in
|
||||||
|
(fun () -> traverse node' next ~fk)
|
||||||
|
with Not_found -> fk)
|
||||||
|
m fk
|
||||||
|
in
|
||||||
|
match opt with
|
||||||
|
| Some v when DFA.is_final dfa i ->
|
||||||
|
(* yield one solution now *)
|
||||||
|
`Cons (v, fk)
|
||||||
|
| _ -> fk () (* fail... or explore subtrees *)
|
||||||
|
in
|
||||||
|
traverse idx 0 ~fk:(fun () -> `Nil)
|
||||||
|
|
||||||
let add idx arr value =
|
let add idx arr value =
|
||||||
assert false (* TODO *)
|
__add ~len:(Array.length arr) ~get:(Array.get arr) idx value
|
||||||
|
|
||||||
let add_string idx arr str =
|
let remove idx arr value =
|
||||||
assert false (* TODO *)
|
__remove ~len:(Array.length arr) ~get:(Array.get arr) idx value
|
||||||
|
|
||||||
let retrieve ~limit idx arr =
|
let retrieve ~limit idx arr =
|
||||||
assert false (* TODO *)
|
let automaton = of_array ~compare:X.compare ~limit arr in
|
||||||
|
__retrieve automaton idx
|
||||||
|
|
||||||
|
let of_list l =
|
||||||
|
List.fold_left
|
||||||
|
(fun acc (arr,v) -> add acc arr v)
|
||||||
|
empty l
|
||||||
|
|
||||||
|
let __to_list ~of_list idx =
|
||||||
|
let rec explore acc trail node = match node with
|
||||||
|
| Node (opt, m) ->
|
||||||
|
(* first, yield current value, if any *)
|
||||||
|
let acc = match opt with
|
||||||
|
| None -> acc
|
||||||
|
| Some v -> (of_list (List.rev trail), v) :: acc
|
||||||
|
in
|
||||||
|
M.fold
|
||||||
|
(fun c node' acc -> explore acc (c::trail) node')
|
||||||
|
m acc
|
||||||
|
in
|
||||||
|
explore [] [] idx
|
||||||
|
|
||||||
|
let to_list idx =
|
||||||
|
__to_list ~of_list:Array.of_list idx
|
||||||
|
end
|
||||||
|
|
||||||
|
module StrIndex = struct
|
||||||
|
include Index(Char)
|
||||||
|
|
||||||
|
let add_string idx str value =
|
||||||
|
__add ~len:(String.length str) ~get:(String.get str) idx value
|
||||||
|
|
||||||
|
let remove_string idx str value =
|
||||||
|
__remove ~len:(String.length str) ~get:(String.get str) idx value
|
||||||
|
|
||||||
let retrieve_string ~limit idx str =
|
let retrieve_string ~limit idx str =
|
||||||
assert false (* TODO *)
|
let automaton = of_string ~limit str in
|
||||||
|
__retrieve automaton idx
|
||||||
|
|
||||||
|
let of_str_list l =
|
||||||
|
List.fold_left
|
||||||
|
(fun acc (str,v) -> add_string acc str v)
|
||||||
|
empty l
|
||||||
|
|
||||||
|
let to_str_list idx =
|
||||||
|
(* clumsy conversion [char list -> string] *)
|
||||||
|
let of_list l =
|
||||||
|
let s = String.make (List.length l) ' ' in
|
||||||
|
List.iteri (fun i c -> s.[i] <- c) l;
|
||||||
|
s
|
||||||
|
in
|
||||||
|
__to_list ~of_list idx
|
||||||
end
|
end
|
||||||
|
|
|
||||||
|
|
@ -65,24 +65,52 @@ type 'a klist =
|
||||||
| `Cons of 'a * (unit -> 'a klist)
|
| `Cons of 'a * (unit -> 'a klist)
|
||||||
]
|
]
|
||||||
|
|
||||||
module Index : sig
|
val klist_to_list : 'a klist -> 'a list
|
||||||
type ('a, 'b) t
|
(** Helper. *)
|
||||||
(** Index that maps 'a strings to values of type 'b. Internally it is
|
|
||||||
|
module Index(X : Map.OrderedType) : sig
|
||||||
|
type key = X.t
|
||||||
|
|
||||||
|
type 'b t
|
||||||
|
(** Index that maps [key] strings to values of type 'b. Internally it is
|
||||||
based on a trie. *)
|
based on a trie. *)
|
||||||
|
|
||||||
val empty : ?compare:('a -> 'a -> int) -> unit -> ('a, 'b) t
|
val empty : 'b t
|
||||||
(** Empty index, possibly with a specific comparison function *)
|
(** Empty index *)
|
||||||
|
|
||||||
val add : ('a, 'b) t -> 'a array -> 'b -> ('a, 'b) t
|
val is_empty : _ t -> bool
|
||||||
|
|
||||||
|
val add : 'b t -> key array -> 'b -> 'b t
|
||||||
(** Add a char array to the index. If a value was already present
|
(** Add a char array to the index. If a value was already present
|
||||||
for this char it is replaced. *)
|
for this array it is replaced. *)
|
||||||
|
|
||||||
val add_string : (char, 'b) t -> string -> 'b -> (char, 'b) t
|
val remove : 'b t -> key array -> 'b -> 'b t
|
||||||
(** Add a string to a char index *)
|
(** Remove a char array from the index. *)
|
||||||
|
|
||||||
val retrieve : limit:int -> ('a, 'b) t -> 'a array -> 'b klist
|
val retrieve : limit:int -> 'b t -> key array -> 'b klist
|
||||||
(** Lazy list of objects associated to strings close to
|
(** Lazy list of objects associated to strings close to
|
||||||
the query string *)
|
the query string *)
|
||||||
|
|
||||||
val retrieve_string : limit:int -> (char,'b) t -> string -> 'b klist
|
val of_list : (key array * 'b) list -> 'b t
|
||||||
|
|
||||||
|
val to_list : 'b t -> (key array * 'b) list
|
||||||
|
|
||||||
|
(* TODO sequence/iteration functions *)
|
||||||
|
end
|
||||||
|
|
||||||
|
(** Specific case for strings *)
|
||||||
|
module StrIndex : sig
|
||||||
|
include module type of Index(Char)
|
||||||
|
|
||||||
|
val add_string : 'b t -> string -> 'b -> 'b t
|
||||||
|
(** Add a string to a char index *)
|
||||||
|
|
||||||
|
val remove_string : 'b t -> string -> 'b -> 'b t
|
||||||
|
(** Remove a string from a char index *)
|
||||||
|
|
||||||
|
val retrieve_string : limit:int -> 'b t -> string -> 'b klist
|
||||||
|
|
||||||
|
val of_str_list : (string * 'b) list -> 'b t
|
||||||
|
|
||||||
|
val to_str_list : 'b t -> (string * 'b) list
|
||||||
end
|
end
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue